
Commit 0c9661e

Authored by Benedikt Volkel (benedikt-voelkel) and co-authors
Apply additional custom cuts (#642)
* cuts are loaded from the corresponding database analysis section in the `Processer` parent class
* the flag `use_cuts` in the analysis section switches usage on or off
* applied in derived classes via the helper method `Processer.apply_cuts_ptbin(df, ipt)` (at the moment implemented in processerdhadrons_mult.py)
* foreseen to place additional cuts ONLY in `Processer_derived.process_histomass_single()`
* remove `process_histomass` from processerdhadrons_mult since it is a duplicate of the one in processer.py

Co-authored-by: Benedikt Volkel <[email protected]>
1 parent dc2cf1c commit 0c9661e

File tree: 4 files changed (+87 −20 lines)

machine_learning_hep/analysis/README.md

Lines changed: 24 additions & 0 deletions
@@ -5,6 +5,30 @@
 First of all, everything in here is basically an **Analyzer**. These objects can be handled by an `AnalysisManager`.
 
+## Applying additional analysis cuts
+
+In order to apply additional cuts before a mass histogram is filled, they have to be set in the corresponding analysis section of the database, one cut per analysis pT bin. If no cut should be applied for a bin, just put `Null`. The flag `use_cuts` controls whether the cuts are applied at all. Cuts are formulated as strings which are passed directly to `pandas.DataFrame.query`, meaning that every name used **must** exist as a column in the dataframe used in the analysis. An example implementation in the database could look like
+
+```yaml
+# within an analysis section, assuming 4 pT bins
+use_cuts: True
+cuts:
+  - "p_prong0 > 2 or p_prong1 < 1"
+  - Null
+  - "abs(eta_cand) < 1.2"
+  - Null
+```
+
+The cuts can then be accessed in `processer_<type>.process_histomass_single`. The database flag `use_cuts` is translated into the member `self.do_custom_analysis_cuts`; check that it is `True` before cutting so as not to circumvent its purpose. There is a helper function in `Processer`, so if you have a dataframe corresponding to a certain pT bin, you can just do
+
+```python
+if self.do_custom_analysis_cuts:
+    df = self.apply_cuts_ptbin(df, ipt)
+```
+
+which applies the cuts defined for the `ipt`'th bin and returns the filtered dataframe. If no cut was defined for that bin, the dataframe is returned unchanged.
+
 ## Analysis and systematic implementation and workflow
 
 A specific analysis or systematics is derived from `Analyzer`. This `AnalyzerDerived` can then implement any analysis step method. Note that passing arguments to those methods is at the moment not supported. However, as they have access to the entire configuration via the database dictionary, this will probably not be needed, as all specifics can be derived from that database.
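Since these cut strings are passed verbatim to `pandas.DataFrame.query`, a cut from the database can be tried out in isolation. A minimal sketch, assuming toy values for the column names used in the example above:

```python
import pandas as pd

# Toy candidate dataframe; the columns must carry exactly the names
# referenced in the cut strings
df = pd.DataFrame({
    "p_prong0": [2.5, 1.0, 3.0],
    "p_prong1": [0.5, 2.0, 1.5],
    "eta_cand": [0.3, -1.5, 0.9],
})

# A cut string exactly as it would appear in the database
cut = "p_prong0 > 2 or p_prong1 < 1"
df_cut = df.query(cut)
print(len(df_cut))  # → 2 (rows 0 and 2 survive)
```

A column referenced in a cut string but missing from the dataframe raises an error at query time, which is why the column-existence requirement above matters.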

machine_learning_hep/data/data_prod_20200417/database_ml_parameters_D0pp_0417.yml

Lines changed: 10 additions & 0 deletions
@@ -737,6 +737,16 @@ D0pp:
     nevents: null
     dodoublecross: false
 
+    # Additional cuts applied before the mass histogram is filled
+    use_cuts: False
+    cuts:
+      - Null
+      - Null
+      - Null
+      - Null
+      - Null
+      - Null
+
     systematics:
       # For now don't do these things per pT bin
       max_chisquare_ndf: 2.
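Since the number of entries under `cuts` must match the number of analysis pT bins, a quick consistency check on the parsed section can catch mismatches early. A minimal sketch, using a plain dict standing in for the parsed YAML (a YAML loader turns `Null` entries into Python `None`; the bin count here is assumed):

```python
# Stand-in for the parsed analysis section of the YAML above
analysis_section = {
    "use_cuts": False,
    "cuts": [None, None, None, None, None, None],
}

n_pt_bins = 6  # must equal the number of analysis pT bins in the database
cuts = analysis_section.get("cuts") or [None] * n_pt_bins
if len(cuts) != n_pt_bins:
    raise ValueError(f"expected {n_pt_bins} cuts, got {len(cuts)}")
print("cuts active:", analysis_section.get("use_cuts", False))  # → cuts active: False
```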

machine_learning_hep/processer.py

Lines changed: 48 additions & 0 deletions
@@ -16,6 +16,7 @@
 main script for doing data processing, machine learning and analysis
 """
 import sys
+from copy import deepcopy
 import multiprocessing as mp
 import pickle
 import os
@@ -143,6 +144,10 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles,
         self.lpt_anbinmin = datap["sel_skim_binmin"]
         self.lpt_anbinmax = datap["sel_skim_binmax"]
         self.p_nptbins = len(self.lpt_anbinmin)
+        # Analysis pT bins
+        self.lpt_finbinmin = datap["analysis"][self.typean]["sel_an_binmin"]
+        self.lpt_finbinmax = datap["analysis"][self.typean]["sel_an_binmax"]
+        self.p_nptfinbins = len(self.lpt_finbinmin)
         self.lpt_model = datap["mlapplication"]["modelsperptbin"]
         self.dirmodel = datap["ml"]["mlout"]
         self.lpt_model = appendmainfoldertolist(self.dirmodel, self.lpt_model)
@@ -203,6 +208,11 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles,
         # if os.path.exists(self.d_root) is False:
         #     self.logger.warning("ROOT tree folder is not there. Is it intentional?")
 
+        # Analysis cuts (loaded in self.process_histomass)
+        self.analysis_cuts = None
+        # Flag whether they should be used
+        self.do_custom_analysis_cuts = datap["analysis"][self.typean].get("use_cuts", False)
+
     def unpack(self, file_index):
         treeevtorig = uproot.open(self.l_root[file_index])[self.n_treeevt]
         try:
@@ -392,6 +402,41 @@ def process_mergedec(self):
             if self.mcordata == "mc":
                 merge_method(self.mptfiles_gensk[ipt], self.lpt_gendecmerged[ipt])
 
+    def load_cuts(self):
+        """Load custom analysis cuts from the database
+        """
+        # Expect a list with one entry per analysis pT bin (self.p_nptfinbins)
+        raw_cuts = self.datap["analysis"][self.typean].get("cuts", None)
+        if not raw_cuts:
+            print("No custom cuts given, hence not cutting...")
+            self.analysis_cuts = [None] * self.p_nptfinbins
+            return
+
+        if len(raw_cuts) != self.p_nptfinbins:
+            print(f"You have {self.p_nptfinbins} pT bins but passed {len(raw_cuts)} cuts. Exit...")
+            sys.exit(1)
+
+        self.analysis_cuts = deepcopy(raw_cuts)
+
+    def apply_cuts_ptbin(self, df_, ipt):
+        """Helper function to cut a dataframe with the cuts for a given pT bin
+
+        Args:
+            df_: dataframe
+            ipt: int
+                i'th pT bin
+        Returns:
+            dataframe
+        """
+        if not self.analysis_cuts[ipt]:
+            return df_
+
+        return df_.query(self.analysis_cuts[ipt])
+
     # pylint: disable=no-member
     def process_histomass(self):
         print("Doing masshisto", self.mcordata, self.period)
@@ -402,6 +447,9 @@ def process_histomass(self):
         else:
             print("No extra selection needed since we are doing std analysis")
 
+        # Load potential custom cuts
+        self.load_cuts()
+
         create_folder_struc(self.d_results, self.l_path)
         arguments = [(i,) for i in range(len(self.l_root))]
         self.parallelizer(self.process_histomass_single, arguments, self.p_chunksizeunp)
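The interplay of the two new helpers — falling back to a per-bin `None` list when no cuts are configured, refusing a wrong-length list, and passing dataframes through untouched for `Null` bins — can be illustrated with a small stand-in class. `CutHelper` and the column name `pt_cand` are hypothetical illustrations, not part of the package:

```python
from copy import deepcopy

import pandas as pd


class CutHelper:
    """Hypothetical stand-in mimicking the cut logic added to Processer."""

    def __init__(self, analysis_config, n_pt_bins):
        self.p_nptfinbins = n_pt_bins
        self.do_custom_analysis_cuts = analysis_config.get("use_cuts", False)
        raw_cuts = analysis_config.get("cuts", None)
        if not raw_cuts:
            # No cuts configured: one no-op entry per pT bin
            self.analysis_cuts = [None] * self.p_nptfinbins
        elif len(raw_cuts) != self.p_nptfinbins:
            raise ValueError(f"{self.p_nptfinbins} pT bins but {len(raw_cuts)} cuts")
        else:
            self.analysis_cuts = deepcopy(raw_cuts)

    def apply_cuts_ptbin(self, df_, ipt):
        # Null/None in the database means: leave this pT bin untouched
        if not self.analysis_cuts[ipt]:
            return df_
        return df_.query(self.analysis_cuts[ipt])


config = {"use_cuts": True, "cuts": ["pt_cand > 2", None]}
helper = CutHelper(config, 2)
df = pd.DataFrame({"pt_cand": [1.0, 3.0, 5.0]})
print(len(helper.apply_cuts_ptbin(df, 0)))  # bin 0: cut applied → 2
print(len(helper.apply_cuts_ptbin(df, 1)))  # bin 1: Null → unchanged → 3
```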

machine_learning_hep/processerdhadrons_mult.py

Lines changed: 5 additions & 20 deletions
@@ -65,9 +65,6 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles,
         self.v_var2_binning_gen = datap["analysis"][self.typean]["var_binning2_gen"]
         self.corr_eff_mult = datap["analysis"][self.typean]["corrEffMult"]
 
-        self.lpt_finbinmin = datap["analysis"][self.typean]["sel_an_binmin"]
-        self.lpt_finbinmax = datap["analysis"][self.typean]["sel_an_binmax"]
-        self.p_nptfinbins = len(self.lpt_finbinmin)
         self.bin_matching = datap["analysis"][self.typean]["binning_matching"]
         #self.sel_final_fineptbins = datap["analysis"][self.typean]["sel_final_fineptbins"]
         self.s_evtsel = datap["analysis"][self.typean]["evtsel"]
@@ -177,6 +174,7 @@ def process_histomass_single(self, index):
         hvtxoutmult.Write()
 
         list_df_recodtrig = []
+
         for ipt in range(self.p_nptfinbins):
             bin_id = self.bin_matching[ipt]
             df = pickle.load(openfile(self.mptfiles_recoskmldec[bin_id][index], "rb"))
@@ -192,6 +190,10 @@ def process_histomass_single(self, index):
             list_df_recodtrig.append(df)
             df = seldf_singlevar(df, self.v_var_binning, \
                                  self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
+
+            if self.do_custom_analysis_cuts:
+                df = self.apply_cuts_ptbin(df, ipt)
+
             for ibin2 in range(len(self.lvar2_binmin)):
                 suffix = "%s%d_%d_%.2f%s_%.2f_%.2f" % \
                     (self.v_var_binning, self.lpt_finbinmin[ipt],
@@ -251,23 +253,6 @@ def process_histomass_single(self, index):
             df_recodtrig[df_recodtrig[self.v_ismcsignal] == 1], "MC"
         ).write()
 
-    def process_histomass(self):
-        print("Doing masshisto", self.mcordata, self.period)
-        print("Using run selection for mass histo", \
-              self.runlistrigger, "for period", self.period)
-        if self.doml is True:
-            print("Doing ml analysis")
-        else:
-            print("No extra selection needed since we are doing std analysis")
-
-        create_folder_struc(self.d_results, self.l_path)
-        arguments = [(i,) for i in range(len(self.l_root))]
-        self.parallelizer(self.process_histomass_single, arguments, self.p_chunksizeunp)
-        tmp_merged = \
-            f"/data/tmp/hadd/{self.case}_{self.typean}/mass_{self.period}/{get_timestamp_string()}/"
-        mergerootfiles(self.l_histomass, self.n_filemass, tmp_merged)
-
     def get_reweighted_count(self, dfsel):
         filename = os.path.join(self.d_mcreweights, self.n_mcreweights)
         weight_file = TFile.Open(filename, "read")
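The per-pT-bin flow in `process_histomass_single` — select the pT window, then optionally apply the custom cut — can be sketched on a toy dataframe. Column names, bin edges, and cut strings here are made up, and `seldf_singlevar` is approximated by a plain range query:

```python
import pandas as pd

# Made-up analysis configuration: 2 pT bins, one custom cut
lpt_finbinmin = [1.0, 3.0]
lpt_finbinmax = [3.0, 6.0]
analysis_cuts = ["abs(eta_cand) < 0.8", None]
do_custom_analysis_cuts = True

df_all = pd.DataFrame({
    "pt_cand": [1.5, 2.5, 3.5, 5.0],
    "eta_cand": [0.2, -1.0, 0.5, 0.9],
})

counts = []
for ipt in range(len(lpt_finbinmin)):
    # Approximation of seldf_singlevar: keep candidates inside the pT bin
    df = df_all.query(
        f"pt_cand >= {lpt_finbinmin[ipt]} and pt_cand < {lpt_finbinmax[ipt]}")
    # Custom cut, skipped for Null bins (mirrors apply_cuts_ptbin)
    if do_custom_analysis_cuts and analysis_cuts[ipt]:
        df = df.query(analysis_cuts[ipt])
    counts.append(len(df))
print(counts)  # → [1, 2]
```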
