Skip to content

Commit 806f2a2

Browse files
benedikt-voelkel (Benedikt Volkel)
and co-authors authored
Analysis and efficiency update (#706)
* add possibility to take efficiencies from a chosen multiplicity bin of another analysis * start development of concise meta data object to not hide all information of pT, mutliplicity and ML working point in a file or histogram name (see MLHEPMetaInfo in root.py) * add an option to add missing keys in modify_dictionary in do_variations.py Co-authored-by: Benedikt Volkel <[email protected]>
1 parent 2d7f193 commit 806f2a2

File tree

6 files changed

+139
-12
lines changed

6 files changed

+139
-12
lines changed

machine_learning_hep/analysis/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@ In order to place additional cuts before a mass histogram is filled, those have
2121
2222
The cuts can then be accessed in `processer_<type>.process_histomass_single`. The database flag `use_cuts` is translated into the member `self.do_custom_analysis_cuts`, which should be checked to be `True` in order not to circumvent its purpose. Then, there is a helper function in `Processer`, so if you have a dataframe corresponding to a certain pT bin, you can just do
2323

24+
## Using efficiencies from another analysis
25+
26+
To use the efficiencies from another analysis for a certain multiplicity bin, one can use the fields `path_eff` and `mult_bin_eff` when using the analyzer class `AnalyzerDhadrons_mult`. When using this feature, both fields have to contain a list as long as the number of multiplicity bins. The entries of the first list are the paths of the efficiency files to be used, and the entries of the second list are integers referring to the i-th multiplicity bin histogram inside the corresponding file. `null` entries can be used to fall back to the efficiencies of this very analysis for that multiplicity bin (which is of course the default when neither of the lists is present).
27+
28+
2429
```python
2530
if self.do_custom_analysis_cuts:
2631
df = self.apply_cuts_ptbin(df, ipt)

machine_learning_hep/analysis/analyzerdhadrons_mult.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,25 @@ def __init__(self, datap, case, typean, period):
177177
None)
178178
self.path_for_crossmb = datap["analysis"][self.typean].get("path_for_crossmb", None)
179179

180+
# Take efficiencies from another analysis.
181+
self.path_file_eff = datap["analysis"][self.typean].get("path_eff", None)
182+
self.mult_bin_eff = datap["analysis"][self.typean].get("mult_bin_eff", None)
183+
184+
if (self.path_file_eff and not self.mult_bin_eff) or \
185+
(not self.path_file_eff and self.mult_bin_eff):
186+
# That is incoherent
187+
self.logger.fatal("Either both or none of the lists \"path_eff\" and \"mult_bin_eff\"" \
188+
"must be specified")
189+
190+
if not self.path_file_eff:
191+
self.path_file_eff = [None] * self.p_nbin2
192+
self.mult_bin_eff = [None] * self.p_nbin2
193+
194+
if len(self.path_file_eff) != self.p_nbin2 or len(self.mult_bin_eff) != self.p_nbin2:
195+
self.logger.fatal("Efficiencies are requested to be taken from another analysis. " \
196+
"Make sure lists \"path_eff\" and \"mult_bin_eff\" have the same " \
197+
"length as the number of those bins (%i).", self.p_nbin2)
198+
180199
# Fitting
181200
self.fitter = None
182201
self.p_performval = datap["analysis"].get("event_cand_validation", None)
@@ -433,23 +452,32 @@ def makenormyields(self):
433452
self.loadstyle()
434453
#self.test_aliphysics()
435454
#filedataval = TFile.Open(self.f_evtnorm)
436-
fileouteff = "%s/efficiencies%s%s.root" % \
437-
(self.d_resultsallpmc, self.case, self.typean)
438455
yield_filename = self.make_file_path(self.d_resultsallpdata, self.yields_filename, "root",
439456
None, [self.case, self.typean])
440457
gROOT.LoadMacro("HFPtSpectrum.C")
441458
from ROOT import HFPtSpectrum, HFPtSpectrum2, HFPtSpectrumRescaled
442459
histonorm = TH1F("histonorm", "histonorm", self.p_nbin2, 0, self.p_nbin2)
443460
for imult in range(self.p_nbin2):
461+
# Choose where efficiencies to take from. Either this mult. bin, another mult. bin
462+
# in this analysis or another mult. bin from another analysis specified explicitly
463+
# by the user.
464+
fileouteff = "{self.d_resultsallpmc}/efficiencies{self.case}{self.typean}.root" \
465+
if not self.path_file_eff[imult] else self.path_file_eff[imult]
466+
if not os.path.exists(fileouteff):
467+
self.logger.fatal("Efficiency file %s could not be found", fileouteff)
444468
bineff = -1
445-
if self.p_bineff is None:
469+
if self.mult_bin_eff[imult] is not None:
470+
bineff = self.mult_bin_eff[imult]
471+
print(f"Use efficiency from bin {bineff} from file {fileouteff}")
472+
elif self.p_bineff is None:
446473
bineff = imult
447474
print("Using efficiency for each var2 bin")
448475
else:
449476
bineff = self.p_bineff
450-
print("Using efficiency always from bin=", bineff)
451-
namehistoeffprompt = "eff_mult%d" % bineff
452-
namehistoefffeed = "eff_fd_mult%d" % bineff
477+
print(f"Using efficiency always from bin={bineff}")
478+
479+
namehistoeffprompt = f"eff_mult{bineff}"
480+
namehistoefffeed = f"eff_fd_mult{bineff}"
453481
nameyield = "hyields%d" % imult
454482
fileoutcrossmult = "%s/finalcross%s%smult%d.root" % \
455483
(self.d_resultsallpdata, self.case, self.typean, imult)

machine_learning_hep/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def update_config(database: dict, run_config: dict, database_overwrite=None): #
5050
# To be implemented
5151
if database_overwrite:
5252
logger.info("Updating database fields with custom user input")
53-
modify_dictionary(database, database_overwrite)
53+
modify_dictionary(database, database_overwrite, True)
5454

5555
# If not an ML analysis...
5656
if not database["doml"]:

machine_learning_hep/do_variations.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -157,14 +157,19 @@ def format_varlabel(varlabel: list, index: int, n_var: int):
157157
'''Format the label of a variation in a variation group.'''
158158
return "%s: %d" % (varlabel[0], index) if len(varlabel) != n_var else varlabel[index]
159159

160-
def modify_dictionary(dic: dict, diff: dict):
161-
'''Modify the dic dictionary using the diff dictionary.'''
160+
def modify_dictionary(dic: dict, diff: dict, add_not_present=False):
161+
'''Modify the dic dictionary using the diff dictionary.
162+
163+
Add additional keys if add_not_present is True
164+
'''
162165
for key, value in diff.items():
163-
if key in dic: # Do not add keys that are not already in the original dictionary.
166+
if key in dic:
164167
if isinstance(value, dict):
165-
modify_dictionary(dic[key], value)
168+
modify_dictionary(dic[key], value, add_not_present)
166169
else:
167170
dic[key] = format_value(dic[key], value)
171+
elif add_not_present:
172+
dic[key] = value
168173
else:
169174
msg_warn("Key %s was not found and will be ignored." % key)
170175

machine_learning_hep/processerdhadrons_mult.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
seldf_singlevar_inclusive, openfile
2929
from machine_learning_hep.utilities import mergerootfiles
3030
from machine_learning_hep.utilities import get_timestamp_string
31+
from machine_learning_hep.root import create_meta_info, write_meta_info
3132
#from machine_learning_hep.globalfitter import fitter
3233
from machine_learning_hep.processer import Processer
3334
from machine_learning_hep.bitwise import filter_bit_df, tag_bit_df
@@ -237,6 +238,12 @@ def process_histomass_single(self, index):
237238
(self.v_var_binning, self.lpt_finbinmin[ipt],
238239
self.lpt_finbinmax[ipt], self.lpt_probcutfin[bin_id],
239240
self.v_var2_binning, self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
241+
curr_dir = myfile.mkdir(f"bin1_{ipt}_bin2_{ibin2}")
242+
meta_info = create_meta_info(self.v_var_binning, self.lpt_finbinmin[ipt],
243+
self.lpt_finbinmax[ipt], self.v_var2_binning,
244+
self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2],
245+
self.lpt_probcutfin[bin_id])
246+
write_meta_info(curr_dir, meta_info)
240247
h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins,
241248
self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
242249
h_invmass_weight = TH1F("h_invmass_weight" + suffix, "", self.p_num_bins,

machine_learning_hep/root.py

Lines changed: 83 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,92 @@
1919
import array
2020
import ast
2121
import numpy as np
22-
from ROOT import TNtuple, TFile # pylint: disable=import-error,no-name-in-module
22+
from ROOT import gROOT, TNtuple, TFile # pylint: disable=import-error,no-name-in-module
2323
from machine_learning_hep.logger import get_logger
2424

2525

26+
META_INFO = "struct MLHEPMetaInfo { \
27+
Float_t firstLow; \
28+
Float_t firstUp; \
29+
Float_t secondLow; \
30+
Float_t secondUp; \
31+
Float_t MLWorkingPoint; \
32+
std::string firstBinName; \
33+
std::string secondBinName; \
34+
};"
35+
gROOT.ProcessLine(META_INFO)
36+
37+
META_INFO_NAME = "MLHEPMetaInfo"
38+
39+
from ROOT import MLHEPMetaInfo # pylint: disable=wrong-import-position, import-error, no-name-in-module, ungrouped-imports
40+
41+
def create_meta_info(first_name, first_low, first_up, second_name, second_low, second_up, ml_wp):
42+
"""Fill MLHEPMetaInfo struct
43+
44+
Custom MLHEP ROOT struct to store meta info
45+
46+
Args:
47+
first_name: str
48+
name of first binning variable
49+
first_low: float
50+
low bin value of first variable
51+
first_up: float
52+
up bin value of first variable
53+
second_name: str
54+
name of second binning variable
55+
second_low: float
56+
low bin value of second variable
57+
second_up: float
58+
up bin value of second variable
59+
ml_wp: float
60+
ML working point used to cut
61+
62+
Returns:
63+
MLHEPMetaInfo
64+
"""
65+
66+
meta_info = MLHEPMetaInfo()
67+
meta_info.firstBinName = first_name
68+
meta_info.firstLow = first_low
69+
meta_info.firstUp = first_up
70+
meta_info.secondBinName = second_name
71+
meta_info.secondLow = second_low
72+
meta_info.secondUp = second_up
73+
meta_info.MLWorkingPoint = ml_wp
74+
return meta_info
75+
76+
77+
def write_meta_info(root_dir, meta_info):
78+
"""Write MLHEPMetaInfo to ROOT directory
79+
80+
Args:
81+
root_dir: inheriting from TDirectory
82+
ROOT directory where to write
83+
meta_info: MLHEPMetaInfo
84+
the meta info to be written
85+
"""
86+
root_dir.WriteObject(meta_info, META_INFO_NAME)
87+
88+
89+
def read_meta_info(root_dir, fail_not_found=True):
90+
"""Read MLHEPMetaInfo
91+
92+
Args:
93+
root_dir: inheriting from TDirectory
94+
ROOT directory where to read from
95+
fail_not_found: bool
96+
if True fail if not found
97+
Returns:
98+
MLHEPMetaInfo
99+
"""
100+
101+
102+
meta_info = root_dir.Get(META_INFO_NAME)
103+
if not meta_info and fail_not_found:
104+
get_logger().fatal("Cannot find %s in directory %s", META_INFO_NAME, root_dir.GetName())
105+
return meta_info
106+
107+
26108
def read_ntuple(ntuple, variables):
27109
"""
28110
Return a numpy array with the values from TNtuple.

0 commit comments

Comments
 (0)