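Fixes: optional efficiency reweighting, per-bin probability-cut checks, merge column suffixes, gen_sl skimming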

Luigi Dello Stritto · Luigi Dello Stritto · commit 0d1cc473f3bb · 2025-02-07T12:17:53.000+01:00
diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py
@@ -231,7 +231,7 @@ def calculate_efficiencies(self):
                 self._save_canvas(c, f'eff/h_ptjet-pthf_eff_{cat}_ptjet.png')
 
             # Run 3 efficiencies
-            for cat in cats:
+            for icat, cat in enumerate(cats):
                 # gen-level efficiency for feeddown estimation
                 h_eff_gen = h_genmatch[cat].Clone()
                 h_eff_gen.Divide(h_gen[cat])
@@ -269,13 +269,24 @@ def calculate_efficiencies(self):
 
                 eff = h_det[cat].Clone(f'h_effnew_{cat}')
                 ensure_sumw2(eff)
-                eff.Divide(h_out)
+                eff.Divide(h_out)  # 2D efficiency (axis 0: ptjet, axis 1: pthf); reweighted below if configured
+
+                if eff_corr := self.cfg('efficiency.reweight'):
+                    for iptjet in range(get_nbins(eff, 0)):
+                        for ipt in range(get_nbins(eff, 1)):
+                            scale_bin(eff, eff_corr[ipt][icat], iptjet+1, ipt+1)
+
                 self._save_hist(eff, f'eff/h_ptjet-pthf_effnew_{cat}.png')
                 self.h_effnew_ptjet_pthf[cat] = eff
 
                 eff_avg = project_hist(h_det[cat], [1], {0: bins_ptjet})
                 ensure_sumw2(eff_avg)
                 eff_avg.Divide(project_hist(h_out, [1], {0: bins_ptjet}))
+
+                if eff_corr := self.cfg('efficiency.reweight'):
+                        for ipt in range(get_nbins(eff_avg, 0)):
+                            scale_bin(eff_avg, eff_corr[ipt][icat], ipt+1)
+
                 self._save_hist(eff_avg, f'eff/h_pthf_effnew_{cat}.png')
                 self.h_effnew_pthf[cat] = eff_avg
 
diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_LcJet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_LcJet_pp.yml
@@ -21,7 +21,7 @@ LcJet_pp:
     # sel_reco_skim: ["mlPromptScore > 0.96", "mlPromptScore > 0.97", "mlPromptScore > 0.9", "mlPromptScore > 0.85", "mlPromptScore > 0.8", "mlPromptScore > 0.6", null]  # (sel_skim_binmin bins)
     sel_reco_skim: [null, null, null, null, null, null, null, null, null, null, null] # (sel_skim_binmin bins)
     sel_gen_skim: [null, null, null, null, null, null, null, null, null, null, null] # (sel_skim_binmin bins)
-    sel_skim_binmin: [1, 2, 3, 4, 5, 6, 7, 8, 10, 12,] # skimming pt bins (sel_skim_binmin bins)
+    sel_skim_binmin: [1, 2, 3, 4, 5, 6, 7, 8, 10, 12] # skimming pt bins (sel_skim_binmin bins)
     sel_skim_binmax: [2, 3, 4, 5, 6, 7, 8, 10, 12, 24] # skimming pt bins (sel_skim_binmin bins)
     var_binning: fPt
     dofullevtmerge: false
@@ -582,8 +582,8 @@ LcJet_pp:
                 xgboost_classifierLcpKpi_dfselection_fPt_12.0_24.0.sav
             ] # sel_skim_binmin bins
         probcutpresel:
-            data: [[0.05, 0.0, 0.0], [0.05, 0.0, 0.0], [0.08, 0.0, 0.0], [0.1, 0.0, 0.0], [0.2, 0.0, 0.0], [0.2, 0.0, 0.0], [0.2, 0.0, 0.0], [0.25, 0.0, 0.0], [0.35, 0.0, 0.0], [0.4, 0.0, 0.0]] #list of nbins
-            mc: [[0.05, 0.0, 0.0], [0.05, 0.0, 0.0], [0.08, 0.0, 0.0], [0.1, 0.0, 0.0], [0.2, 0.0, 0.0], [0.2, 0.0, 0.0], [0.2, 0.0, 0.0], [0.25, 0.0, 0.0], [0.35, 0.0, 0.0], [0.4, 0.0, 0.0]] #list of nbins
+            data: [[0.02, 0.0, 0.0], [0.03, 0.0, 0.0], [0.04, 0.0, 0.0], [0.07, 0.0, 0.0], [0.09, 0.0, 0.0], [0.11, 0.0, 0.0], [0.15, 0.0, 0.0], [0.18, 0.0, 0.0], [0.25, 0.0, 0.0], [0.35, 0.0, 0.0]] #list of nbins
+            mc: [[0.02, 0.0, 0.0], [0.03, 0.0, 0.0], [0.04, 0.0, 0.0], [0.07, 0.0, 0.0], [0.09, 0.0, 0.0], [0.11, 0.0, 0.0], [0.15, 0.0, 0.0], [0.18, 0.0, 0.0], [0.25, 0.0, 0.0], [0.35, 0.0, 0.0]] #list of nbins
         probcutoptimal: [[0.02, 0.0, 0.0], [0.03, 0.0, 0.0], [0.04, 0.0, 0.0], [0.07, 0.0, 0.0], [0.09, 0.0, 0.0], [0.11, 0.0, 0.0], [0.15, 0.0, 0.0], [0.18, 0.0, 0.0], [0.25, 0.0, 0.0], [0.35, 0.0, 0.0]] #list of nbins
 
     #region analysis
diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_LcToPKPi_newformat_mult_ana.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_LcToPKPi_newformat_mult_ana.yml
@@ -63,7 +63,6 @@ LcpKpi:
         trees:
           O2hflcmccollbase: [fPosX, fPosY, fPosZ, fCentFT0M]
           O2hflcmcrcollid: [fIndexArrayHFLCCOLLBASES]
-        rename: {old: fCentFT0M, new: fCentFT0Mmc}
 
       reco:
         level: all
diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py
@@ -189,33 +189,28 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab
         # Potentially mask certain values (e.g. nsigma TOF of -999)
         self.p_mask_values = datap["ml"].get("mask_values", None)
 
-        self.lpt_probcutpre = datap["mlapplication"]["probcutpresel"][self.mcordata]
-        self.lpt_probcutfin = datap["analysis"][self.typean].get("probcuts", None)
-
         self.bins_skimming = np.array(list(zip(self.lpt_anbinmin, self.lpt_anbinmax)), 'd')
         self.bins_analysis = np.array(list(zip(self.lpt_finbinmin, self.lpt_finbinmax)), 'd')
         bin_matching = [
             [ptrange[0] <= bin[0] and ptrange[1] >= bin[1] for ptrange in self.bins_skimming].index(True)
             for bin in self.bins_analysis
         ]
 
-        # Make it backwards-compatible
-        if not self.lpt_probcutfin:
-            lpt_probcutfin_tmp = datap["mlapplication"]["probcutoptimal"]
-            self.lpt_probcutfin = []
-            for i in range(self.p_nptfinbins):
-                bin_id = bin_matching[i]
-                self.lpt_probcutfin.append(lpt_probcutfin_tmp[bin_id])
+        self.lpt_probcutpre = datap["mlapplication"]["probcutpresel"][self.mcordata]
+        lpt_probcutfin_tmp = datap["mlapplication"]["probcutoptimal"]
+        self.lpt_probcutfin = [lpt_probcutfin_tmp[bin_matching[ibin]]
+            for ibin in range(self.p_nptfinbins)]
 
-        if self.mltype == "MultiClassification":
-            for probcutfin, probcutpre in zip(self.lpt_probcutfin, self.lpt_probcutpre):
+        for ibin, probcutfin in enumerate(self.lpt_probcutfin):
+            probcutpre = self.lpt_probcutpre[bin_matching[ibin]]
+            if self.mltype == "MultiClassification":
                 if probcutfin[0] > probcutpre[0] or probcutfin[1] < probcutpre[1] or probcutfin[2] < probcutpre[2]:
                     self.logger.fatal("Probability cut final: %s must be tighter than presel %s!\n" \
                             "Verify that bkg prob presel > final, and other cuts presel < final",
                             self.lpt_probcutfin, self.lpt_probcutpre)
-        elif self.lpt_probcutfin < self.lpt_probcutpre:
-            self.logger.fatal("Probability cut final: %s must be tighter (smaller values) than presel %s!",
-                    self.lpt_probcutfin, self.lpt_probcutpre)
+            elif probcutfin < probcutpre:
+                self.logger.fatal("Probability cut final: %s must be tighter (smaller values) than presel %s!",
+                        self.lpt_probcutfin, self.lpt_probcutpre)
 
         if self.mltype == "MultiClassification":
             self.l_selml = []
@@ -418,10 +413,6 @@ def dfuse(df_spec):
                             dfs[df_name][var] = np.logical_and(dfs[df_name][var] == 1, swapped)
                 self.logger.debug(' %s -> done', df_name)
 
-                if 'rename' in df_spec:
-                    spec = df_spec['rename']
-                    dfs[df_name] = dfs[df_name].rename(columns={spec['old']: spec['new']})
-
 
         if self.df_merge:
             for m_spec in self.df_merge:
@@ -433,18 +424,18 @@ def dfuse(df_spec):
                         self.logger.info('merging %s with %s on %s into %s', base, ref, on, out)
                         if not isinstance(on, list) or 'df' not in on:
                             on = ['df', on]
-                        dfs[out] = dfmerge(dfs[base], dfs[ref], on=on)
+                        dfs[out] = dfmerge(dfs[base], dfs[ref], suffixes=(f'_{base}', None), on=on)
                     elif (on := m_spec.get('left_on', None)) is not None:
                         self.logger.info('merging %s with %s on %s into %s', base, ref, on, out)
                         if not is_numeric_dtype(dfs[base][on]):
                             self.logger.info('exploding dataframe %s on variable %s', base, on)
-                            dfs[out] = dfmerge(dfs[base].explode(on), dfs[ref], left_on=['df', on], right_index=True)
+                            dfs[out] = dfmerge(dfs[base].explode(on), dfs[ref], left_on=['df', on], suffixes=(f'_{base}', None), right_index=True)
                         else:
-                            dfs[out] = dfmerge(dfs[base], dfs[ref], left_on=['df', on], right_index=True)
+                            dfs[out] = dfmerge(dfs[base], dfs[ref], left_on=['df', on], suffixes=(f'_{base}', None), right_index=True)
                     else:
                         var = self.df_read[ref]['index']
                         self.logger.info('merging %s with %s on %s (default) into %s', base, ref, var, out)
-                        dfs[out] = dfmerge(dfs[base], dfs[ref], left_on=['df', var], right_index=True)
+                        dfs[out] = dfmerge(dfs[base], dfs[ref], left_on=['df', var], suffixes=(f'_{base}', None), right_index=True)
                     if 'extra' in m_spec:
                         self.logger.debug(' %s -> extra', out)
                         for col_name, col_val in m_spec['extra'].items():
@@ -462,9 +453,7 @@ def dfuse(df_spec):
     def skim(self, file_index):
         dfreco = read_df(self.l_reco[file_index])
         dfgen = read_df(self.l_gen[file_index]) if self.mcordata == 'mc' else None
-
-        if self.n_gen_sl:
-            dfgen_sl = read_df(self.l_gen_sl[file_index]) if self.mcordata == 'mc' else None
+        dfgen_sl = read_df(self.l_gen_sl[file_index]) if self.n_gen_sl and self.mcordata == 'mc' else None
 
         for ipt in range(self.p_nptbins):
             dfrecosk = seldf_singlevar(dfreco, self.v_var_binning,
@@ -478,7 +467,7 @@ def skim(self, file_index):
                 dfgensk = dfquery(dfgensk, self.s_gen_skim[ipt])
                 write_df(dfgensk, self.mptfiles_gensk[ipt][file_index])
 
-            if self.n_gen_sl:
+            if dfgen_sl is not None:
                 dfgensk_sl = seldf_singlevar(dfgen_sl, self.v_var_binning,
                                           self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt])
                 dfgensk_sl = dfquery(dfgensk_sl, self.s_gen_skim[ipt])