trigger corr using a histogram as input (#672)

arthur-gal · Arthur · Benedikt Volkel · web-flow · commit d07aad6f2e16 · 2020-05-04T12:02:16.000+02:00
* trigger corr using a histogram as input

* Propagate trigger weighting flag

* analysis/systematics.py anyway needs further attention

* removed weighttrig flag from all databases and code, using
  usetriggcorrfunc instead:
  1. usetriggcorrfunc is None: no weighting at all
  2. usetriggcorrfunc is True: weighting according to fitted function
  3. usetriggcorrfunc is False: weighting according to histogram

Co-authored-by: Arthur <agal@aliceml.cern.ch>
Co-authored-by: Benedikt Volkel <benedikt.volkel@cern.ch>
diff --git a/machine_learning_hep/analysis/analyzerdhadrons_mult.py b/machine_learning_hep/analysis/analyzerdhadrons_mult.py
@@ -168,7 +168,9 @@ def __init__(self, datap, case, typean, period):
         self.p_triggereff = datap["analysis"][self.typean].get("triggereff", [1] * 10)
         self.p_triggereffunc = datap["analysis"][self.typean].get("triggereffunc", [0] * 10)
 
-        self.apply_weights = datap["analysis"][self.typean]["triggersel"]["weighttrig"]
+        self.apply_weights = \
+                datap["analysis"][self.typean]["triggersel"].get("usetriggcorrfunc", None) \
+                is not None
         self.root_objects = []
 
         self.get_crossmb_from_path = datap["analysis"][self.typean].get("get_crossmb_from_path", \
diff --git a/machine_learning_hep/analysis/systematics.py b/machine_learning_hep/analysis/systematics.py
@@ -149,7 +149,9 @@ def __init__(self, datap, case, typean, period, run_param):
         self.s_presel_gen_eff = datap["analysis"][self.typean]["presel_gen_eff"]
         self.s_trigger_mc = datap["analysis"][self.typean]["triggersel"]["mc"]
         self.s_trigger_data = datap["analysis"][self.typean]["triggersel"]["data"]
-        self.apply_weights = datap["analysis"][self.typean]["triggersel"]["weighttrig"]
+        self.apply_weights = \
+                datap["analysis"][self.typean]["triggersel"].get("usetriggcorrfunc", None) \
+                is not None
 
         #Build names for input pickle files (data, mc_reco, mc_gen)
         self.n_reco = datap["files_names"]["namefile_reco"]
diff --git a/machine_learning_hep/data/data_prod_20200304/database_ml_parameters_D0pp_0304.yml b/machine_learning_hep/data/data_prod_20200304/database_ml_parameters_D0pp_0304.yml
@@ -297,7 +297,6 @@ D0pp:
       triggersel:
         data: "trigger_hasbit_INT7==1"
         mc: null
-        weighttrig: false
       data: &data_out_default
         runselection: [null, null, null] #FIXME
         results: [/data/DerivedResultsJets/D0kAnywithJets/vAN-20200304_ROOT6-1/zg/default/default/pp_2016_data/374_20200304-2028/resultsMBjetvspt,
@@ -482,7 +481,7 @@ D0pp:
       triggersel:
         data: "trigger_hasbit_HighMultSPD==1"
         mc: null
-        weighttrig: true
+        usetriggcorrfunc: True
 
       data:
         runselection: [null, null, HighMultSPD2018] #FIXME the last will have to be replaced by HighMultSPD2018
@@ -577,7 +576,6 @@ D0pp:
       triggersel:
         data: "trigger_hasbit_INT7==1"
         mc: null
-        weighttrig: false
 
       data:
         runselection: [null, null, null]
@@ -671,7 +669,6 @@ D0pp:
       triggersel:
         data: "trigger_hasbit_INT7==1"
         mc: null
-        weighttrig: false
 
       data:
         runselection: [null, null, null]
diff --git a/machine_learning_hep/data/data_prod_20200304/database_ml_parameters_Dspp.yml b/machine_learning_hep/data/data_prod_20200304/database_ml_parameters_Dspp.yml
@@ -294,7 +294,7 @@ Dspp:
       triggersel:
         data: "trigger_hasbit_INT7==1"
         mc: null
-        weighttrig: false
+        usetriggcorrfunc: Null
 
       data:
         runselection: [null, null, null]
@@ -403,7 +403,7 @@ Dspp:
       triggersel:
         data: "trigger_hasbit_HighMultSPD==1"
         mc: null
-        weighttrig: true
+        usetriggcorrfunc: true
 
       data:
         runselection: [null, null, HighMultSPD2018]
diff --git a/machine_learning_hep/data/data_prod_20200304/database_ml_parameters_LcpK0spp_0304.yml b/machine_learning_hep/data/data_prod_20200304/database_ml_parameters_LcpK0spp_0304.yml
@@ -372,7 +372,6 @@ LcpK0spp:
       triggersel:
         data: "trigger_hasbit_INT7==1"
         mc: null
-        weighttrig: false
       data: &data_out_default
         runselection: [null, null, null]
         results: [/data/DerivedResultsJets/LckAnywithJets_sub/vAN-20200304_ROOT6-1/ff/default/default/pp_2016_data/374_20200304-2028/resultsMBjetvspt,
@@ -464,7 +463,6 @@ LcpK0spp:
       triggersel:
         data: "trigger_hasbit_INT7==1"
         mc: null
-        weighttrig: false
       data:
         runselection: [null, null, null]
         results: [/data/DerivedResults/LckAnywithJets_sub/vAN-20200304_ROOT6-1/pp_2016_data/374_20200304-2028/resultsMBvspt_ntrkl,
@@ -559,7 +557,7 @@ LcpK0spp:
       triggersel:
         data: "trigger_hasbit_HighMultSPD==1"
         mc: null
-        weighttrig: true
+        usetriggcorrfunc: True
       data:
         runselection: [null, null, HighMultSPD2018]
         results: [/data/DerivedResults/LckAnywithJets_sub/vAN-20200304_ROOT6-1/pp_2016_data/374_20200304-2028/resultsSPDvspt,
@@ -652,7 +650,6 @@ LcpK0spp:
       triggersel:
         data: "trigger_hasbit_INT7==1"
         mc: null
-        weighttrig: false
       data:
         runselection: [null, null, null]
         results: [/data/DerivedResults/LckAnywithJets_sub/vAN-20200304_ROOT6-1/pp_2016_data/374_20200304-2028/resultsMBvspt_perc_v0m,
diff --git a/machine_learning_hep/data/data_prod_20200417/database_ml_parameters_D0pp_0417.yml b/machine_learning_hep/data/data_prod_20200417/database_ml_parameters_D0pp_0417.yml
@@ -306,7 +306,6 @@ D0pp:
       triggersel:
         data: "trigger_hasbit_INT7==1"
         mc: null
-        weighttrig: false
       data: &data_out_default
         runselection: [null, null, null] #FIXME
         results: [/data/DerivedResultsJets/D0kINTHighMultCALOwithJets/vAN-20200417_ROOT6-1/zg/default/default/pp_2016_data/405_20200417-1825/resultsMBjetvspt,
@@ -491,7 +490,7 @@ D0pp:
       triggersel:
         data: "trigger_hasbit_HighMultSPD==1"
         mc: null
-        weighttrig: true
+        usetriggcorrfunc: True
 
       data:
         runselection: [null, null, HighMultSPD2018] #FIXME the last will have to be replaced by HighMultSPD2018
@@ -586,7 +585,6 @@ D0pp:
       triggersel:
         data: "trigger_hasbit_INT7==1"
         mc: null
-        weighttrig: false
 
       data:
         runselection: [null, null, null]
@@ -680,7 +678,6 @@ D0pp:
       triggersel:
         data: "trigger_hasbit_INT7==1"
         mc: null
-        weighttrig: false
 
       data:
         runselection: [null, null, null]
diff --git a/machine_learning_hep/fitting/helpers.py b/machine_learning_hep/fitting/helpers.py
@@ -117,7 +117,7 @@ def __init__(self, database: dict, ana_type: str, file_data_name: str, file_mc_n
         self.include_reflections = ana_config.get("include_reflection", False)
 
         # Is this a trigger weighted histogram?
-        self.apply_weights = ana_config["triggersel"]["weighttrig"]
+        self.apply_weights = ana_config["triggersel"].get("usetriggcorrfunc", None) is not None
 
         # Systematics
         self.syst_pars = ana_config.get("systematics", {})
diff --git a/machine_learning_hep/fitting/simple_fit.py b/machine_learning_hep/fitting/simple_fit.py
@@ -169,7 +169,7 @@ def do_simple_fit(database, type_ana, period_number=-1, output_dir="simple_fit")
     include_reflections = fit_pars.get("include_reflection", False)
 
     # Is this a trigger weighted histogram?
-    apply_weights = fit_pars["triggersel"]["weighttrig"]
+    apply_weights = fit_pars["triggersel"].get("usetriggcorrfunc", None) is not None
 
     # 4) Misc
     # ML WP is needed to build the suffix for extracting the mass histogram
diff --git a/machine_learning_hep/processerdhadrons_mult.py b/machine_learning_hep/processerdhadrons_mult.py
@@ -74,13 +74,17 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles,
         self.event_cand_validation = datap["analysis"][self.typean].get("event_cand_validation", "")
         if "event_cand_validation" not in datap["analysis"][self.typean]:
             self.event_cand_validation = False
-        self.apply_weights = datap["analysis"][self.typean]["triggersel"]["weighttrig"]
+        self.usetriggcorrfunc = \
+                datap["analysis"][self.typean]["triggersel"].get("usetriggcorrfunc", None)
         self.weightfunc = None
-        if self.apply_weights is True and self.mcordata == "data":
+        self.weighthist = None
+        if self.usetriggcorrfunc is not None and self.mcordata == "data":
             filename = os.path.join(self.d_mcreweights, "trigger%s.root" % self.typean)
             if os.path.exists(filename):
                 weight_file = TFile.Open(filename, "read")
                 self.weightfunc = weight_file.Get("func%s_norm" % self.typean)
+                self.weighthist = weight_file.Get("hist%s_norm" % self.typean)
+                self.weighthist.SetDirectory(0)
                 weight_file.Close()
             else:
                 print("trigger correction file", filename, "doesnt exist")
@@ -89,9 +93,37 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles,
         self.maxvaluehisto = datap["analysis"][self.typean]["maxvaluehisto"]
         self.mass = datap["mass"]
 
-    def gethistonormforselevt_mult(self, df_evt, dfevtevtsel, label, var, weightfunc=None):
+    @staticmethod
+    def make_weights(col, func, hist, use_func):
+        """Helper function to extract weights
 
-        if weightfunc is not None:
+        Args:
+            col: np.array
+                array to evaluate/run over
+            func: ROOT.TF1
+                ROOT function to use for evaluation
+            hist: TH1
+                ROOT histogram used for getting weights
+            use_func: bool
+                whether or not to use func (otherwise hist)
+
+        Returns:
+            iterable
+        """
+
+        if use_func:
+            return evaluate(func, col)
+        def reg(value):
+            # warning, the histogram has empty bins at high mult.
+            # (>125 ntrkl) so a check is needed to avoid a 1/0 division
+            # when computing the inverse of the weight
+            return value if value != 0. else 1.
+        return [reg(hist.GetBinContent(hist.FindBin(iw))) for iw in col]
+
+
+    def gethistonormforselevt_mult(self, df_evt, dfevtevtsel, label, var, useweightfromfunc=None):
+
+        if useweightfromfunc is not None:
             label = label + "_weight"
         hSelMult = TH1F('sel_' + label, 'sel_' + label, self.nbinshisto,
                         self.minvaluehisto, self.maxvaluehisto)
@@ -105,14 +137,20 @@ def gethistonormforselevt_mult(self, df_evt, dfevtevtsel, label, var, weightfunc
         df_no_vtx = df_to_keep[~tag_vtx.values]
         # events with reco zvtx > 10 cm after previous selection
         df_bit_zvtx_gr10 = filter_bit_df(df_to_keep, 'is_ev_rej', [[3], [1, 2, 7, 12]])
-        if weightfunc is not None:
-            weightssel = evaluate(weightfunc, dfevtevtsel[var])
+
+
+        if useweightfromfunc is not None:
+            weightssel = self.make_weights(dfevtevtsel[var], self.weightfunc, self.weighthist,
+                                           useweightfromfunc)
+            weightsnovtx = self.make_weights(df_no_vtx[var], self.weightfunc, self.weighthist,
+                                             useweightfromfunc)
+            weightsgr10 = self.make_weights(df_bit_zvtx_gr10[var], self.weightfunc,
+                                            self.weighthist, useweightfromfunc)
+
             weightsinvsel = [1./weight for weight in weightssel]
             fill_hist(hSelMult, dfevtevtsel[var], weights=weightsinvsel)
-            weightsnovtx = evaluate(weightfunc, df_no_vtx[var])
             weightsinvnovtx = [1./weight for weight in weightsnovtx]
             fill_hist(hNoVtxMult, df_no_vtx[var], weights=weightsinvnovtx)
-            weightsgr10 = evaluate(weightfunc, df_bit_zvtx_gr10[var])
             weightsinvgr10 = [1./weight for weight in weightsgr10]
             fill_hist(hVtxOutMult, df_bit_zvtx_gr10[var], weights=weightsinvgr10)
         else:
@@ -161,10 +199,10 @@ def process_histomass_single(self, index):
             self.gethistonormforselevt_mult(dfevtorig, dfevtevtsel, \
                                        labeltrigger, self.v_var2_binning_gen)
 
-        if self.apply_weights is True and self.mcordata == "data":
+        if self.usetriggcorrfunc is not None and self.mcordata == "data":
             hselweight, hnovtxmultweight, hvtxoutmultweight = \
                 self.gethistonormforselevt_mult(dfevtorig, dfevtevtsel, \
-                    labeltrigger, self.v_var2_binning_gen, self.weightfunc)
+                    labeltrigger, self.v_var2_binning_gen, self.usetriggcorrfunc)
             hselweight.Write()
             hnovtxmultweight.Write()
             hvtxoutmultweight.Write()
@@ -175,7 +213,7 @@ def process_histomass_single(self, index):
 
         list_df_recodtrig = []
 
-        for ipt in range(self.p_nptfinbins):
+        for ipt in range(self.p_nptfinbins): # pylint: disable=too-many-nested-blocks
             bin_id = self.bin_matching[ipt]
             df = pickle.load(openfile(self.mptfiles_recoskmldec[bin_id][index], "rb"))
             if self.s_evtsel is not None:
@@ -206,8 +244,10 @@ def process_histomass_single(self, index):
                 df_bin = seldf_singlevar_inclusive(df, self.v_var2_binning, \
                                          self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
                 fill_hist(h_invmass, df_bin.inv_mass)
-                if self.apply_weights is True and self.mcordata == "data":
-                    weights = evaluate(self.weightfunc, df_bin[self.v_var2_binning_gen])
+                if self.usetriggcorrfunc is not None and self.mcordata == "data":
+                    weights = self.make_weights(df_bin[self.v_var2_binning_gen], self.weightfunc,
+                                                self.weighthist, self.usetriggcorrfunc)
+
                     weightsinv = [1./weight for weight in weights]
                     fill_hist(h_invmass_weight, df_bin.inv_mass, weights=weightsinv)
                 myfile.cd()