add an option to drop filter rules for a given dataset

scarlehoff · scarlehoff · commit 535f2becf1a1 · 2025-04-09T12:11:48.000+02:00
diff --git a/doc/sphinx/source/vp/filters.rst b/doc/sphinx/source/vp/filters.rst
@@ -307,6 +307,33 @@ append a list of filter rules to the rules obtained by the mechanisms described
 The value of ``added_filter_rules`` should be a list of rules with the same format as ``filter_rules``.
 
 
+.. _drop_filter_rules:
+
+Dropping filter rules for selected datasets
+-------------------------------------------
+
+Sometimes it might be necessary to drop the filter rules for a dataset while keeping all other rules intact.
+This is possible with the ``drop_internal_rules`` key, which will drop all dataset-scoped rules applying to a given dataset.
+Since ``drop_internal_rules`` is applied before ``added_filter_rules`` it can be utilized to reset the rules for a given dataset
+while keeping all other internal rules.
+
+.. code:: yaml
+
+    use_cuts: "internal"
+    pdf: "NNPDF40_nnlo_as_01180"
+
+    dataset_inputs:
+      - { dataset: ATLAS_Z0J_8TEV_PT-Y }
+      - { dataset: ATLAS_Z0J_8TEV_PT-M }
+
+    theoryid: 40_000_000
+
+    drop_internal_rules:
+      - ATLAS_Z0J_8TEV_PT-Y
+
+    actions_:
+      - groups_chi2_table
+
 
 Examples
 --------
@@ -345,13 +372,13 @@ less than NNLO (i.e LO or NLO). I check what the process type of
 
 .. code:: ipython
 
-   In [1]: from validphys.loader import Loader                                                                                                                                   
+   In [1]: from validphys.loader import Loader
 
-   In [2]: l = Loader()                                                                                                                                                          
+   In [2]: l = Loader()
 
-   In [3]: cd = l.check_commondata("CMSDY2D12")                                                                                                                                  
+   In [3]: cd = l.check_commondata("CMSDY2D12")
 
-   In [4]: cd.process_type                                                                                                                                                       
+   In [4]: cd.process_type
    Out[4]: 'EWK_RAP'
 
 Then cross check this against ``NNPDF.CommonData.kinLabels`` to see that
diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py
@@ -1372,7 +1372,13 @@ def parse_added_filter_rules(self, rules: (list, type(None)) = None):
         """
         return tuple(AddedFilterRule(**rule) for rule in rules) if rules else None
 
-    @functools.lru_cache
+    def parse_drop_internal_rules(self, drop_internal_rules: list | None = None):
+        """Turns drop_internal_rules into a tuple for internal caching."""
+        if drop_internal_rules is None:
+            return tuple()
+        return tuple(drop_internal_rules)
+
+    @functools.cache
     def produce_rules(
         self,
         theoryid,
@@ -1382,8 +1388,20 @@ def produce_rules(
         filter_rules=None,
         default_filter_rules_recorded_spec_=None,
         added_filter_rules: (tuple, type(None)) = None,
+        drop_internal_rules: tuple = tuple(),
     ):
-        """Produce filter rules based on the user defined input and defaults."""
+        """Produce filter rules based on the user defined input and defaults.
+
+        It is possible to overwrite or extend the internal rules from the runcard
+        using the following variables:
+
+        ``filter_rules``: tuple(rules)
+            Drop all internal rules and take these instead
+        ``added_filter_rules``: tuple(rules)
+            Extend the internal rules with these
+        ``drop_internal_rules``: tuple(dataset names)
+            Drop internal dataset-specific rules; this is applied before ``added_filter_rules``
+        """
 
         theory_parameters = theoryid.get_description()
 
@@ -1397,15 +1415,20 @@ def produce_rules(
                 filter_rules = default_filter_rules_input()
 
         try:
-            rule_list = [
-                Rule(
-                    initial_data=rule,
-                    defaults=defaults,
-                    theory_parameters=theory_parameters,
-                    loader=self.loader,
+            rule_list = []
+            for rule in filter_rules:
+                # Don't load rules that are to be dropped
+                if rule.dataset in drop_internal_rules:
+                    continue
+
+                rule_list.append(
+                    Rule(
+                        initial_data=rule,
+                        defaults=defaults,
+                        theory_parameters=theory_parameters,
+                        loader=self.loader,
+                    )
                 )
-                for rule in filter_rules
-            ]
         except RuleProcessingError as e:
             raise ConfigError(f"Error Processing filter rules: {e}") from e
 
diff --git a/validphys2/src/validphys/tests/conftest.py b/validphys2/src/validphys/tests/conftest.py
@@ -16,6 +16,8 @@
 settings.register_profile("extratime", deadline=1500)
 settings.load_profile("extratime")
 
+lhapdf.setVerbosity(0)
+
 
 # Fortunately py.test works much like reportengine and providers are
 # connected by argument names.
diff --git a/validphys2/src/validphys/tests/test_filter_rules.py b/validphys2/src/validphys/tests/test_filter_rules.py
@@ -101,7 +101,7 @@ def test_good_rules():
     l = Loader()
     rules = [mkrule(inp) for inp in good_rules]
     dsnames = ['ATLAS_1JET_8TEV_R06_PTY', 'NMC_NC_NOTFIXED_EM-F2']
-    variants = ["legacy","legacy_dw"]
+    variants = ["legacy", "legacy_dw"]
     for dsname, v in zip(dsnames, variants):
         ds = l.check_dataset(
             dsname, cuts='internal', rules=tuple(rules), theoryid=THEORYID, variant=v
@@ -137,3 +137,42 @@ def test_added_rules():
     assert np.isnan(tb["empty data"].iloc[1, 1])
     assert tb["empty data"]["ndata"].iloc[0] == 0
     assert np.all(tb[1:]["fewer data"] != tb[1:]["Original"])
+
+
+def test_drop_internal_rules(data_internal_cuts_config, test_dataset="CMS_Z0J_8TEV_PT-Y"):
+    """Check that the key drop_internal_rules work as expected:
+    - Drops all cuts for a given dataset
+    - It is applied before added_filter_rules
+    """
+    assert test_dataset in [
+        i["dataset"] for i in data_internal_cuts_config["dataset_inputs"]
+    ], "If you updated the test DATA, please update this test as well"
+
+    def test_fun(**config):
+        """Use some internal validphy function which will for sure use cuts and separate
+        the results for the test dataset.
+        """
+        # Get data and predictions separated by dataset (drop grouping)
+        ret = API.group_result_central_table_no_table(**config).droplevel(0)
+        # Now separate the test dataset from the rest
+        df_test = ret.loc[test_dataset]
+        df_rest = ret.drop(index=test_dataset)
+        return df_test, df_rest
+
+    # Use internal cuts
+    def_test, def_all = test_fun(**data_internal_cuts_config)
+
+    # Drop all rules for the test dataset only
+    drop_test, drop_all = test_fun(**data_internal_cuts_config, drop_internal_rules=[test_dataset])
+
+    assert len(drop_test) > len(def_test), "Cuts have not been dropped!"
+    assert len(drop_all) == len(def_all), "Drop cuts have affected other datasets!"
+
+    # Add a new rule for this dataset while dropping all previous rules
+    new_rule = {"dataset": test_dataset, "rule": "pT >= 80"}
+    add_test, add_all = test_fun(
+        **data_internal_cuts_config,
+        added_filter_rules=[new_rule],
+        drop_internal_rules=[test_dataset]
+    )
+    assert len(new_rule) < len(drop_test), "New rule has not been added after dropping the cuts!"