Improve dataspec handling logic for excluded datasets

jekoorn · jekoorn · commit cbe183a6c9ed · 2025-11-27T13:46:14.000+01:00
diff --git a/validphys2/src/validphys/comparefittemplates/comparecard.yaml b/validphys2/src/validphys/comparefittemplates/comparecard.yaml
@@ -99,6 +99,10 @@ lumi_report:
   meta: Null
   template: lumi.md
 
+excluded_datasets:
+  use_cuts: "fromfit"
+  from_: excluded
+
 template: report.md
 
 positivity:
@@ -110,10 +114,6 @@ description:
 dataset_inputs:
   from_: fit
 
-excluded_datasets:
-  use_cuts: "fromfit"
-  from_: excluded
-
 dataspecs:
   - theoryid:
       from_: current
diff --git a/validphys2/src/validphys/comparefittemplates/comparecard_excluded.yaml b/validphys2/src/validphys/comparefittemplates/comparecard_excluded.yaml
@@ -99,6 +99,12 @@ lumi_report:
   meta: Null
   template: lumi.md
 
+excluded_report:
+  meta: Null
+  use_cuts: "fromfit"
+  from_: dataspecs
+  template: excluded.md
+
 template: report_with_excluded.md
 
 positivity:
@@ -110,14 +116,6 @@ description:
 dataset_inputs:
   from_: fit
 
-excluded_datasets:
-  use_cuts: "fromfit"
-  from_: dataspecs
-
-excluded_positivity:
-  use_cuts: "fromfit"
-  from_: dataspecs
-
 dataspecs:
   - theoryid:
       from_: current
diff --git a/validphys2/src/validphys/comparefittemplates/report_with_excluded.md b/validphys2/src/validphys/comparefittemplates/report_with_excluded.md
@@ -11,15 +11,12 @@ We are comparing:
 
 {@ summarise_fits @}
 
-
-
 Datasets excluded from fit
 --------------------------
 {@with matched_excluded_datasets_by_name@}
 [Plots for {@dataset_name@}]({@plot_fancy@})
 {@endwith@}
 
-
 Code versions
 -------------
 {@fits_version_table@}
diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py
@@ -1000,6 +1000,7 @@ def produce_matched_datasets_from_dataspecs(self, dataspecs):
         res.sort(key=lambda x: (x["process"], x["dataset_name"]))
         return res
 
+#### JCM code
 
 #    def produce_matched_excluded_datasets_by_name(self, dataspecs):
 #        import pdb; pdb.set_trace()
@@ -1014,49 +1015,98 @@ def produce_matched_datasets_from_dataspecs(self, dataspecs):
 #        }
 #        return [{"dataset_input": i, **more_info} for i in dinputs_b[1:3]]
 
+#### JELLE code
+
+#    def produce_matched_excluded_datasets_by_name(self, dataspecs):
+#        """Like produce_matched_datasets_from_dataspecs but for all datasets excluded from the fit.""" 
+#        self._check_dataspecs_type(dataspecs)
+#        loader = Loader()
+#    
+#        all_used = []
+#    
+#        for spec in dataspecs:
+#            with self.set_context(ns=self._curr_ns.new_child(spec)):
+#                _, data_input = self.parse_from_(None, "data_input", write=False)
+#                names = {}
+#                for dsin in data_input:
+#                    cd = self.produce_commondata(dataset_input=dsin)
+#                    proc = get_info(cd).nnpdf31_process
+#                    ds = dsin.name
+#                    names[(proc, ds)] = dsin
+#                all_used.append(names)
+#
+#        union = set.union(*(set(d) for d in all_used))
+#        intersection = set.intersection(*(set(d) for d in all_used))
+#        excluded_set = union - intersection
+#
+#        excluded_datasets = []
+#        for names in all_used:
+#            for k in excluded_set:
+#                if k in names:
+#                    excluded_datasets.append(names[k])
+#
+#        more_info = {
+#            "pdfs": [i["pdf"] for i in dataspecs],
+#            "theoryid": dataspecs[0]["theoryid"],
+#            "fit": dataspecs[0]["fit"],
+#        }
+#        return [
+#            {
+#                "dataset_input": dsin, 
+#                "dataset_name": dsin.name,
+#                **more_info
+#            } 
+#            for dsin in excluded_datasets
+#        ]
+
+#### attempt to generalize
+
     def produce_matched_excluded_datasets_by_name(self, dataspecs):
-        """Like produce_matched_datasets_from_dataspecs but for all datasets excluded from the fit.""" 
+        """Return the dataset inputs that do not appear in every dataspec, each tagged with the theoryid and fit of the dataspec it came from and the pdfs of all dataspecs."""
         self._check_dataspecs_type(dataspecs)
         loader = Loader()
     
-        all_used = []
+        # Maps (process, dataset name) -> list of (dataset_input, dataspec) pairs, one per occurrence.
+        excluded_sets = {}
     
         for spec in dataspecs:
             with self.set_context(ns=self._curr_ns.new_child(spec)):
                 _, data_input = self.parse_from_(None, "data_input", write=False)
-                names = {}
+    
                 for dsin in data_input:
                     cd = self.produce_commondata(dataset_input=dsin)
                     proc = get_info(cd).nnpdf31_process
                     ds = dsin.name
-                    names[(proc, ds)] = dsin
-                all_used.append(names)
-
-        union = set.union(*(set(d) for d in all_used))
-        intersection = set.intersection(*(set(d) for d in all_used))
-        excluded_set = union - intersection
-
-        excluded_datasets = []
-        excluded_dataset_names = []
-        for names in all_used:
-            for k in excluded_set:
-                if k in names:
-                    excluded_datasets.append(names[k])
-                    excluded_dataset_names.append(k)
-
-        more_info = {
-            "pdfs": [i["pdf"] for i in dataspecs],
-            "theoryid": dataspecs[0]["theoryid"],
-            "fit": dataspecs[1]["fit"],
+                    key = (proc, ds)
+    
+                    if key not in excluded_sets:
+                        excluded_sets[key] = []
+                    excluded_sets[key].append((dsin, spec))
+    
+        all_keys = set(excluded_sets)
+        excluded_keys = {
+            k for k, occurences_for_key in excluded_sets.items()
+            if len(occurences_for_key) < len(dataspecs)
         }
-        return [
-            {
-                "dataset_input": dsin, 
-                "dataset_name": dsin.name,
-                **more_info
-            } 
-            for dsin in excluded_datasets
-        ]
+    
+        def build_more_info(spec):
+            return {
+                "pdfs": [i["pdf"] for i in dataspecs],
+                "theoryid": spec["theoryid"],
+                "fit": spec["fit"],
+            }
+    
+        out = []
+        for key in excluded_keys:
+            for dsin, spec in excluded_sets[key]:
+                out.append({
+                    "dataset_input": dsin,
+                    "dataset_name": dsin.name,
+                    **build_more_info(spec),
+                })
+    
+        return out
+
 
     def produce_matched_excluded_datasets_from_dataspecs(self, dataspecs):
         return self.produce_matched_excluded_datasets_by_name(dataspecs)
diff --git a/validphys2/src/validphys/scripts/vp_comparefits.py b/validphys2/src/validphys/scripts/vp_comparefits.py
@@ -233,6 +233,7 @@ def complete_mapping(self):
         if are_the_same:
             log.info("Using excluded comparecard: identical theory cuts/covmat detected")
             autosettings["template"] = "report_with_excluded.md"
+            args['config_yml'] = comparefittemplates.template_with_excluded_path
         return autosettings