v2: fixes to validation and upconversion (#351)

dweindl · web-flow · commit ec584630c303 · 2024-12-20T15:12:12.000+01:00
* don't create unnecessary experiment tables
* write yaml file *after* updating the config dict
* check for missing experiments
* handle anonymous experiments
diff --git a/petab/v2/lint.py b/petab/v2/lint.py
@@ -228,11 +228,16 @@ class CheckValidPetabIdColumn(ValidationTask):
     """A task to check that a given column contains only valid PEtab IDs."""
 
     def __init__(
-        self, table_name: str, column_name: str, required_column: bool = True
+        self,
+        table_name: str,
+        column_name: str,
+        required_column: bool = True,
+        ignore_nan: bool = False,
     ):
         self.table_name = table_name
         self.column_name = column_name
         self.required_column = required_column
+        self.ignore_nan = ignore_nan
 
     def run(self, problem: Problem) -> ValidationIssue | None:
         df = getattr(problem, f"{self.table_name}_df")
@@ -248,7 +253,10 @@ def run(self, problem: Problem) -> ValidationIssue | None:
             return
 
         try:
-            check_ids(df[self.column_name].values, kind=self.column_name)
+            ids = df[self.column_name].values
+            if self.ignore_nan:
+                ids = ids[~pd.isna(ids)]
+            check_ids(ids, kind=self.column_name)
         except ValueError as e:
             return ValidationError(str(e))
 
@@ -308,21 +316,26 @@ def run(self, problem: Problem) -> ValidationIssue | None:
         except AssertionError as e:
             return ValidationError(str(e))
 
-        # TODO: introduce some option for validation partial vs full
+        # TODO: introduce some option for validation of partial vs full
         #  problem. if this is supposed to be a complete problem, a missing
         #  condition table should be an error if the measurement table refers
-        #  to conditions
-
-        # check that measured experiments
-        if problem.experiment_df is None:
-            return
-
+        #  to conditions, otherwise it should maximally be a warning
         used_experiments = set(problem.measurement_df[EXPERIMENT_ID].values)
-        available_experiments = set(
-            problem.experiment_df[EXPERIMENT_ID].unique()
+        # handle default-experiment
+        used_experiments = set(
+            filter(
+                lambda x: not pd.isna(x),
+                used_experiments,
+            )
+        )
+        # check that measured experiments exist
+        available_experiments = (
+            set(problem.experiment_df[EXPERIMENT_ID].unique())
+            if problem.experiment_df is not None
+            else set()
         )
         if missing_experiments := (used_experiments - available_experiments):
-            raise AssertionError(
+            return ValidationError(
                 "Measurement table references experiments that "
                 "are not specified in the experiments table: "
                 + str(missing_experiments)
@@ -826,6 +839,7 @@ def append_overrides(overrides):
     CheckMeasurementTable(),
     CheckConditionTable(),
     CheckExperimentTable(),
+    CheckValidPetabIdColumn("measurement", EXPERIMENT_ID, ignore_nan=True),
     CheckValidPetabIdColumn("experiment", EXPERIMENT_ID),
     CheckValidPetabIdColumn("experiment", CONDITION_ID),
     CheckExperimentConditionsExist(),
diff --git a/petab/v2/petab1to2.py b/petab/v2/petab1to2.py
@@ -9,14 +9,11 @@
 import pandas as pd
 from pandas.io.common import get_handle, is_url
 
-import petab.v1.C
-from petab.models import MODEL_TYPE_SBML
-from petab.v1 import Problem as ProblemV1
-from petab.yaml import get_path_prefix
-
 from .. import v1, v2
-from ..v1.yaml import load_yaml, validate, write_yaml
+from ..v1 import Problem as ProblemV1
+from ..v1.yaml import get_path_prefix, load_yaml, validate, write_yaml
 from ..versions import get_major_version
+from .models import MODEL_TYPE_SBML
 
 __all__ = ["petab1to2"]
 
@@ -63,18 +60,18 @@ def petab1to2(yaml_config: Path | str, output_dir: Path | str = None):
     if get_major_version(yaml_config) != 1:
         raise ValueError("PEtab problem is not version 1.")
     petab_problem = ProblemV1.from_yaml(yaml_file or yaml_config)
+    # get rid of conditionName column if present (unsupported in v2)
+    petab_problem.condition_df = petab_problem.condition_df.drop(
+        columns=[v1.C.CONDITION_NAME], errors="ignore"
+    )
     if v1.lint_problem(petab_problem):
         raise ValueError("Provided PEtab problem does not pass linting.")
 
+    output_dir = Path(output_dir)
+
     # Update YAML file
     new_yaml_config = _update_yaml(yaml_config)
 
-    # Write new YAML file
-    output_dir = Path(output_dir)
-    output_dir.mkdir(parents=True, exist_ok=True)
-    new_yaml_file = output_dir / Path(yaml_file).name
-    write_yaml(new_yaml_config, new_yaml_file)
-
     # Update tables
     # condition tables, observable tables, SBML files, parameter table:
     #  no changes - just copy
@@ -104,6 +101,19 @@ def petab1to2(yaml_config: Path | str, output_dir: Path | str = None):
         def create_experiment_id(sim_cond_id: str, preeq_cond_id: str) -> str:
             if not sim_cond_id and not preeq_cond_id:
                 return ""
+            # check whether the conditions will exist in the v2 condition table
+            sim_cond_exists = (
+                petab_problem.condition_df.loc[sim_cond_id].notna().any()
+            )
+            preeq_cond_exists = (
+                preeq_cond_id
+                and petab_problem.condition_df.loc[preeq_cond_id].notna().any()
+            )
+            if not sim_cond_exists and not preeq_cond_exists:
+                # if we have only all-NaN conditions, we don't create a new
+                #  experiment
+                return ""
+
             if preeq_cond_id:
                 preeq_cond_id = f"{preeq_cond_id}_"
             exp_id = f"experiment__{preeq_cond_id}__{sim_cond_id}"
@@ -126,6 +136,8 @@ def create_experiment_id(sim_cond_id: str, preeq_cond_id: str) -> str:
             sim_cond_id = row[v1.C.SIMULATION_CONDITION_ID]
             preeq_cond_id = row.get(v1.C.PREEQUILIBRATION_CONDITION_ID, "")
             exp_id = create_experiment_id(sim_cond_id, preeq_cond_id)
+            if not exp_id:
+                continue
             if preeq_cond_id:
                 experiments.append(
                     {
@@ -165,10 +177,8 @@ def create_experiment_id(sim_cond_id: str, preeq_cond_id: str) -> str:
             # add pre-eq condition id if not present or convert to string
             #  for simplicity
             if v1.C.PREEQUILIBRATION_CONDITION_ID in measurement_df.columns:
-                measurement_df[
-                    v1.C.PREEQUILIBRATION_CONDITION_ID
-                ] = measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID].astype(
-                    str
+                measurement_df.fillna(
+                    {v1.C.PREEQUILIBRATION_CONDITION_ID: ""}, inplace=True
                 )
             else:
                 measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID] = ""
@@ -177,7 +187,7 @@ def create_experiment_id(sim_cond_id: str, preeq_cond_id: str) -> str:
                 petab_problem.condition_df is not None
                 and len(
                     set(petab_problem.condition_df.columns)
-                    - {petab.v1.C.CONDITION_NAME}
+                    - {v1.C.CONDITION_NAME}
                 )
                 == 0
             ):
@@ -209,6 +219,10 @@ def create_experiment_id(sim_cond_id: str, preeq_cond_id: str) -> str:
                 measurement_df, get_dest_path(measurement_file)
             )
 
+    # Write new YAML file
+    new_yaml_file = output_dir / Path(yaml_file).name
+    write_yaml(new_yaml_config, new_yaml_file)
+
     # validate updated Problem
     validation_issues = v2.lint_problem(new_yaml_file)