Merge pull request #2387 from NNPDF/nnpdf-vp-excluded-datasets

scarlehoff · web-flow · commit a4705627137d · 2025-12-22T21:20:16.000+01:00
Excluded dataset page for vp-comparefits
diff --git a/n3fit/runcards/examples/developing.yml b/n3fit/runcards/examples/developing.yml
@@ -81,7 +81,7 @@ parameters: # This defines the parameter dictionary that is passed to the Model
   stopping_patience: 0.1 # percentage of the number of epochs
   layer_type: 'dense'
   dropout: 0.01
-  interpolation_points: 7
+  feature_scaling_points: 7
 
 fitting:
   fitbasis: CCBAR_ASYMM
diff --git a/validphys2/src/validphys/comparefittemplates/comparecard.yaml b/validphys2/src/validphys/comparefittemplates/comparecard.yaml
@@ -99,6 +99,26 @@ lumi_report:
   meta: Null
   template: lumi.md
 
+mismatched_information:
+  meta: Null
+  actions_:
+    - report
+  
+  # Datasets will go to their own page
+  mismatched_report:
+    meta: Null
+    template: mismatched.md
+
+  template_text: |
+    Mismatched datasets
+    ---------------------
+    The following plots correspond to datasets which are not available in one of the fits.
+
+    {@with mismatched_datasets_by_name@}
+    [Plots for {@dataset_name@}]({@mismatched_report report@})
+    {@endwith@}
+
+
 template: report.md
 
 positivity:
@@ -119,6 +139,8 @@ dataspecs:
       from_: current
     speclabel:
       from_: current
+    dataset_inputs:
+      from_: fit
 
   - theoryid:
       from_: reference
@@ -128,6 +150,8 @@ dataspecs:
       from_: reference
     speclabel:
       from_: reference
+    dataset_inputs:
+      from_: fit
       
 t0_info:
   - use_t0: True
diff --git a/validphys2/src/validphys/comparefittemplates/mismatched.md b/validphys2/src/validphys/comparefittemplates/mismatched.md
@@ -0,0 +1,5 @@
+% Data-theory comparison for the mismatched dataset {@dataset_name@}
+# Absolute
+{@plot_fancy@}
+# Normalized
+{@Datanorm plot_fancy@}
diff --git a/validphys2/src/validphys/comparefittemplates/report.md b/validphys2/src/validphys/comparefittemplates/report.md
@@ -99,12 +99,17 @@ $\phi$ by {@processed_metadata_group@}
 {@plot_fits_groups_data_phi@}
 {@endwith@}
 
+
+
 Dataset plots
 -------------
 {@with matched_datasets_from_dataspecs@}
 [Plots for {@dataset_name@}]({@dataset_report report@})
 {@endwith@}
 
+[Mismatched datasets]({@mismatched_information report@})
+--------------------
+
 Positivity
 ----------
 {@with matched_positivity_from_dataspecs@}
diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py
@@ -36,7 +36,7 @@
     default_filter_rules_input,
     default_filter_settings_input,
 )
-from validphys.fitdata import fitted_replica_indexes, num_fitted_replicas
+from validphys.fitdata import fitted_replica_indexes, match_datasets_by_name, num_fitted_replicas
 from validphys.gridvalues import LUMI_CHANNELS
 from validphys.loader import (
     DataNotFoundError,
@@ -981,14 +981,12 @@ def produce_matched_datasets_from_dataspecs(self, dataspecs):
         for spec in dataspecs:
             with self.set_context(ns=self._curr_ns.new_child(spec)):
                 _, data_input = self.parse_from_(None, "data_input", write=False)
-
                 names = {}
                 for dsin in data_input:
                     cd = self.produce_commondata(dataset_input=dsin)
                     proc = get_info(cd).nnpdf31_process
                     ds = dsin.name
                     names[(proc, ds)] = dsin
-
                 all_names.append(names)
         used_set = set.intersection(*(set(d) for d in all_names))
         res = []
@@ -997,13 +995,62 @@ def produce_matched_datasets_from_dataspecs(self, dataspecs):
             # TODO: Should this have the same name?
             inner_spec_list = inres["dataspecs"] = []
             for ispec, spec in enumerate(dataspecs):
-                # Passing spec by referene
+                # Passing spec by reference
                 d = ChainMap({"dataset_input": all_names[ispec][k]}, spec)
                 inner_spec_list.append(d)
             res.append(inres)
         res.sort(key=lambda x: (x["process"], x["dataset_name"]))
         return res
 
+    def produce_mismatched_datasets_by_name(self, dataspecs):
+        """
+        Like produce_matched_datasets_from_dataspecs, but for mismatched datasets from a fit comparison.
+        Returns the mismatched datasets, each tagged with the theoryid and fit of the dataspec it came from and the pdfs of all dataspecs. Set up to work with plot_fancy.
+
+        A dataset is considered mismatched if another dataspec has no dataset with both the same name and the same variant.
+        """
+
+        self._check_dataspecs_type(dataspecs)
+
+        # Parse the data for the comparison so that only variant and dataset are actually tested
+        parsed_data = []
+        for spec in dataspecs:
+            tmp = [(i.name, i.variant) for i in spec["dataset_inputs"]]
+            parsed_data.append((spec, tmp))
+
+        # TODO:
+        # This is a convoluted way of checking whether there are mismatches
+        # between the lists of dataset inputs of a list of specs.
+        # This is not going to win any codegolf tournaments
+        already_mismatched = []
+        mismatched_dinputs = []
+        for spec, parsed_dinputs in parsed_data:
+            for spec_to_check, parsed_dinputs_to_check in parsed_data:
+                if spec == spec_to_check:
+                    continue
+                for i, parsed_dinput in enumerate(parsed_dinputs):
+                    # Use a list of already mismatched data to avoid duplicates
+                    if parsed_dinput in already_mismatched:
+                        continue
+                    if parsed_dinput not in parsed_dinputs_to_check:
+                        dinput = spec["dataset_inputs"][i]
+                        mismatched_dinputs.append((dinput, spec))
+                        already_mismatched.append(parsed_dinput)
+
+        res = []
+        # prepare output for plot_fancy
+        for dsin, spec in mismatched_dinputs:
+            res.append(
+                {
+                    "dataset_input": dsin,
+                    "dataset_name": dsin.name,
+                    "theoryid": spec["theoryid"],
+                    "pdfs": [i["pdf"] for i in dataspecs],
+                    "fit": spec["fit"],
+                }
+            )
+        return res
+
     def produce_matched_positivity_from_dataspecs(self, dataspecs):
         """Like produce_matched_datasets_from_dataspecs but for positivity datasets."""
         self._check_dataspecs_type(dataspecs)
@@ -1014,7 +1061,6 @@ def produce_matched_positivity_from_dataspecs(self, dataspecs):
                 names = {(p.name): (p) for p in pos}
                 all_names.append(names)
         used_set = set.intersection(*(set(d) for d in all_names))
-
         res = []
         for k in used_set:
             inres = {"posdataset_name": k}
diff --git a/validphys2/src/validphys/scripts/vp_comparefits.py b/validphys2/src/validphys/scripts/vp_comparefits.py
@@ -68,7 +68,6 @@ def add_positional_arguments(self, parser):
             help="Use LUX basis (which include the photon) for the report",
             action='store_true',
         )
-
         parser.set_defaults()
 
     def try_complete_args(self):
@@ -178,6 +177,7 @@ def get_commandline_arguments(self, cmdline=None):
             args['config_yml'] = comparefittemplates.template_pol_path
         else:
             args['config_yml'] = comparefittemplates.template_path
+
         return args
 
     def complete_mapping(self):
@@ -223,8 +223,16 @@ def complete_mapping(self):
                     'unpolarized_bc': {'from_': 'positivity_bound'},
                 }
             )
+        are_the_same = self.check_identical_theory_cuts_covmat()
+        if are_the_same:
+            log.info("Adding mismatched datasets page: identical theory, data cuts and covmat detected")
+        else:
+            autosettings["mismatched_information"] = {
+            "template_text": "Mismatched datasets cannot be shown because the theory, data cuts and/or theory covmat are not identical"
+            }
         return autosettings
 
+
     def get_config(self):
         self.try_complete_args()
         # No error handling here because this is our internal file
@@ -234,6 +242,23 @@ def get_config(self):
             c = yaml_safe.load(f)
         c.update(self.complete_mapping())
         return self.config_class(c, environment=self.environment)
+    
+    def check_identical_theory_cuts_covmat(self):
+        """ 
+        Checks whether the theory ID, data cuts, and thcovmat are the same between the two fits.    
+        In the affirmative case, a mismatched datasets page will be added to the report.
+        """
+        args = self.args
+        l = self.environment.loader 
+        current_runcard = l.check_fit(args['current_fit']).as_input()
+        reference_runcard = l.check_fit(args['reference_fit']).as_input()
+        
+        current_thcovmat = current_runcard.get("theorycovmatconfig")
+        reference_thcovmat = reference_runcard.get("theorycovmatconfig")
+        same_theoryid = current_runcard.get("theory", {}).get("theoryid") == reference_runcard.get("theory", {}).get("theoryid")
+        same_datacuts = current_runcard.get("datacuts") == reference_runcard.get("datacuts")
+        same_thcovmat = (current_thcovmat == reference_thcovmat)
+        return same_theoryid and same_datacuts and same_thcovmat
 
 
 def main():