
Commit f69a0f3

Merge branch 'main' into hash

2 parents: c7262ed + 18f2cf8


48 files changed: +367, -247 lines

Snakefile

Lines changed: 6 additions & 6 deletions

@@ -60,10 +60,10 @@ def write_parameter_log(algorithm, param_label, logfile):
 def write_dataset_log(dataset, logfile):
     dataset_contents = get_dataset(_config.config.datasets,dataset)

-    # safe_dump gives RepresenterError for an OrderedDict
-    # config file has to convert the dataset from OrderedDict to dict to avoid this
-    with open(logfile,'w') as f:
-        yaml.safe_dump(dataset_contents,f)
+    # safe_dump gives RepresenterError for a DatasetSchema
+    # config file has to convert the dataset to a dict to avoid this
+    with open(logfile, 'w') as f:
+        yaml.safe_dump(dict(dataset_contents), f)

 # Choose the final files expected according to the config file options.
 def make_final_input(wildcards):

@@ -154,9 +154,9 @@ rule log_datasets:
 # Input preparation needs to be rerun if these files are modified
 def get_dataset_dependencies(wildcards):
     dataset = _config.config.datasets[wildcards.dataset]
-    all_files = dataset["node_files"] + dataset["edge_files"] + dataset["other_files"]
+    all_files = dataset.node_files + dataset.edge_files + dataset.other_files
     # Add the relative file path
-    all_files = [dataset["data_dir"] + SEP + data_file for data_file in all_files]
+    all_files = [dataset.data_dir + SEP + data_file for data_file in all_files]

     return all_files
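
The dict() conversion in the first hunk matters because yaml.safe_dump only knows how to represent plain Python types. A minimal sketch of the failure mode, using a toy pydantic model as a stand-in for the real DatasetSchema:

    import yaml
    from pydantic import BaseModel

    class ToyDataset(BaseModel):  # stand-in for DatasetSchema; fields are illustrative
        label: str = "egfr"
        data_dir: str = "input"

    contents = ToyDataset()
    # yaml.safe_dump(contents)             # raises yaml.representer.RepresenterError
    print(yaml.safe_dump(dict(contents)))  # dict(model) iterates (field, value) pairs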

docs/htcondor.rst

Lines changed: 3 additions & 2 deletions

@@ -54,10 +54,11 @@ might look like:

 .. code:: bash

-    apptainer build spras-v0.6.0.sif docker://reedcompbio/spras:v0.6.0
+    apptainer build spras-v0.6.0.sif docker://reedcompbio/spras:0.6.0

 After running this command, a new file called ``spras-v0.6.0.sif`` will
-exist in the directory where the command was run.
+exist in the directory where the command was run. Note that the Docker
+image does not use a "v" in the tag.

 Submitting All Jobs to a Single EP
 ----------------------------------

spras/allpairs.py

Lines changed: 0 additions & 2 deletions

@@ -35,8 +35,6 @@ def generate_inputs(data: Dataset, filename_map):
     # Get sources and targets for node input file
     # Borrowed code from pathlinker.py
     sources_targets = data.get_node_columns(["sources", "targets"])
-    if sources_targets is None:
-        raise ValueError("All Pairs Shortest Paths requires sources and targets")

     both_series = sources_targets.sources & sources_targets.targets
     for _index, row in sources_targets[both_series].iterrows():
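
The surviving both_series line relies on pandas boolean indexing to find nodes flagged as both sources and targets; a self-contained illustration with made-up node data:

    import pandas as pd

    sources_targets = pd.DataFrame({
        "NODEID": ["A", "B", "C"],
        "sources": [True, True, False],
        "targets": [True, False, True],
    })
    both_series = sources_targets.sources & sources_targets.targets
    print(sources_targets[both_series])  # only A is both a source and a target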

spras/analysis/ml.py

Lines changed: 4 additions & 1 deletion

@@ -459,8 +459,11 @@ def jaccard_similarity_eval(summary_df: pd.DataFrame, output_file: str | PathLik
     ax.set_yticklabels(algorithms)
     plt.colorbar(cax, ax=ax)
     # annotate each cell with the corresponding similarity value
+    # where we set the precision to be lower as the number of algorithms increases
+    n = 2
+    if len(algorithms) > 10: n = 1
     for i in range(len(algorithms)):
         for j in range(len(algorithms)):
-            ax.text(j, i, f'{jaccard_matrix.values[i, j]:.2f}', ha='center', va='center', color='white')
+            ax.text(j, i, f'{jaccard_matrix.values[i, j]:.{n}f}', ha='center', va='center', color='white')
     plt.savefig(output_png, bbox_inches="tight", dpi=DPI)
     plt.close()
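
A runnable sketch of the adaptive-precision annotation added here, with random stand-in values in place of the real Jaccard matrix:

    import numpy as np
    import matplotlib.pyplot as plt

    algorithms = [f"alg{i}" for i in range(12)]
    jaccard = np.random.default_rng(0).uniform(size=(12, 12))

    fig, ax = plt.subplots()
    cax = ax.matshow(jaccard)
    plt.colorbar(cax, ax=ax)
    n = 2
    if len(algorithms) > 10:
        n = 1  # one decimal keeps the labels legible in smaller cells
    for i in range(len(algorithms)):
        for j in range(len(algorithms)):
            ax.text(j, i, f'{jaccard[i, j]:.{n}f}', ha='center', va='center', color='white')
    plt.savefig("jaccard.png", bbox_inches="tight")
    plt.close()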

spras/btb.py

Lines changed: 2 additions & 13 deletions

@@ -44,19 +44,8 @@ def generate_inputs(data, filename_map):

     # Get sources and write to file, repeat for targets
     # Does not check whether a node is a source and a target
-    for node_type in ['sources', 'targets']:
-        nodes = data.get_node_columns([node_type])
-        if nodes is None:
-            raise ValueError(f'No {node_type} found in the node files')
-
-        # TODO test whether this selection is needed, what values could the column contain that we would want to
-        # include or exclude?
-        nodes = nodes.loc[nodes[node_type]]
-        if node_type == "sources":
-            nodes.to_csv(filename_map["sources"], sep= '\t', index=False, columns=['NODEID'], header=False)
-        elif node_type == "targets":
-            nodes.to_csv(filename_map["targets"], sep= '\t', index=False, columns=['NODEID'], header=False)
-
+    for node_type, nodes in data.get_node_columns_separate(['sources', 'targets']).items():
+        nodes.to_csv(filename_map[node_type], sep='\t', index=False, columns=['NODEID'], header=False)

     # Create network file
     edges = data.get_interactome()
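
The new get_node_columns_separate helper is not shown in this commit; from the call site above it appears to return a mapping from each requested column name to a DataFrame ready for writing. A hypothetical sketch of that contract (the filtering logic is inferred, not confirmed by this diff):

    import pandas as pd

    # Hypothetical: one DataFrame per boolean node column, keeping only the True rows
    def get_node_columns_separate_sketch(node_table, node_types):
        return {t: node_table.loc[node_table[t], ["NODEID"]] for t in node_types}

    nodes = pd.DataFrame({"NODEID": ["A", "B"],
                          "sources": [True, False],
                          "targets": [False, True]})
    for node_type, selected in get_node_columns_separate_sketch(nodes, ["sources", "targets"]).items():
        selected.to_csv(f"{node_type}.txt", sep="\t", index=False, columns=["NODEID"], header=False)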

spras/config/config.py

Lines changed: 21 additions & 22 deletions

@@ -17,7 +17,6 @@
 import hashlib
 import importlib.metadata
 import itertools as it
-import os
 import subprocess
 import tomllib
 import warnings

@@ -28,8 +27,8 @@
 import yaml

 from spras.config.container_schema import ProcessedContainerSettings
-from spras.config.schema import RawConfig
-from spras.util import NpHashEncoder, hash_params_sha1_base32
+from spras.config.schema import DatasetSchema, RawConfig
+from spras.util import LoosePathLike, NpHashEncoder, hash_params_sha1_base32

 config = None

@@ -93,19 +92,7 @@ def init_global(config_dict):

 def init_from_file(filepath):
     global config
-
-    # Handle opening the file and parsing the yaml
-    filepath = os.path.abspath(filepath)
-    try:
-        with open(filepath, 'r') as yaml_file:
-            config_dict = yaml.safe_load(yaml_file)
-    except FileNotFoundError as e:
-        raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") from e
-    except yaml.YAMLError as e:
-        raise RuntimeError(f"Error: Failed to parse config '{filepath}'") from e
-
-    # And finally, initialize
-    config = Config(config_dict)
+    config = Config.from_file(filepath)


 class Config:

@@ -123,7 +110,7 @@ def __init__(self, raw_config: dict[str, Any]):
         # Directory used for storing output
         self.out_dir = parsed_raw_config.reconstruction_settings.locations.reconstruction_dir
         # A dictionary to store configured datasets against which SPRAS will be run
-        self.datasets = None
+        self.datasets: dict[str, DatasetSchema] = {}
         # A dictionary to store configured gold standard data against output of SPRAS runs
         self.gold_standards = None
         # The hash length SPRAS will use to identify parameter combinations.

@@ -162,6 +149,20 @@ def __init__(self, raw_config: dict[str, Any]):

         self.process_config(parsed_raw_config)

+    @classmethod
+    def from_file(cls, filepath: LoosePathLike):
+        # Handle opening the file and parsing the yaml
+        filepath = Path(filepath).absolute()
+        try:
+            with open(filepath, 'r') as yaml_file:
+                config_dict = yaml.safe_load(yaml_file)
+        except FileNotFoundError as e:
+            raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") from e
+        except yaml.YAMLError as e:
+            raise RuntimeError(f"Error: Failed to parse config '{filepath}'") from e
+
+        return cls(config_dict)
+
     def process_datasets(self, raw_config: RawConfig):
         """
         Parse dataset information

@@ -176,16 +177,14 @@ def process_datasets(self, raw_config: RawConfig):
         # Convert to dicts to simplify the yaml logging

         for dataset in raw_config.datasets:
+            label = dataset.label
+            if label.lower() in [key.lower() for key in self.datasets.keys()]:
+                raise ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.")
             dataset.label = attach_spras_revision(dataset.label)
         for gold_standard in raw_config.gold_standards:
             gold_standard.label = attach_spras_revision(gold_standard.label)

-        self.datasets = {}
-        for dataset in raw_config.datasets:
-            label = dataset.label
-            if label.lower() in [key.lower() for key in self.datasets.keys()]:
-                raise ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.")
-            self.datasets[label] = dict(dataset)

         # parse gold standard information
         self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards}
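
With the parsing moved into a classmethod, a Config can be built directly from a path; a minimal usage sketch (the path is illustrative):

    from spras.config.config import Config

    # Raises RuntimeError if the file is missing or is not valid YAML
    config = Config.from_file("config/config.yaml")

init_from_file keeps its old behavior of setting the module-level global, but now delegates all of the file handling to Config.from_file.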

spras/config/dataset.py

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+from typing import Annotated
+
+from pydantic import AfterValidator, BaseModel, ConfigDict
+
+from spras.config.util import label_validator
+from spras.util import LoosePathLike
+
+
+class DatasetSchema(BaseModel):
+    """
+    Collection of information related to `Dataset` objects in the configuration.
+    """
+
+    # We prefer AfterValidator here to allow pydantic to run its own
+    # validation & coercion logic before we check it against our own
+    # requirements
+    label: Annotated[str, AfterValidator(label_validator("Dataset"))]
+    node_files: list[LoosePathLike]
+    edge_files: list[LoosePathLike]
+    other_files: list[LoosePathLike]
+    data_dir: LoosePathLike
+
+    model_config = ConfigDict(extra='forbid')
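
A short validation sketch for the new model; the field values are illustrative:

    from spras.config.dataset import DatasetSchema

    dataset = DatasetSchema(
        label="egfr",              # must match ^\w+$, enforced by label_validator
        node_files=["nodes.txt"],
        edge_files=["edges.txt"],
        other_files=[],
        data_dir="input",
    )

    # extra='forbid' means an unknown key (e.g. a typo) raises pydantic.ValidationError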

spras/config/schema.py

Lines changed: 3 additions & 27 deletions

@@ -10,14 +10,14 @@
 - `CaseInsensitiveEnum` (see ./util.py)
 """

-import re
 from typing import Annotated

 from pydantic import AfterValidator, BaseModel, ConfigDict

 from spras.config.algorithms import AlgorithmUnion
 from spras.config.container_schema import ContainerSettings
-from spras.config.util import CaseInsensitiveEnum
+from spras.config.dataset import DatasetSchema
+from spras.config.util import CaseInsensitiveEnum, label_validator

 # Most options here have an `include` property,
 # which is meant to make disabling parts of the configuration easier.

@@ -79,30 +79,6 @@ class Analysis(BaseModel):
 # The default length of the truncated hash used to identify parameter combinations
 DEFAULT_HASH_LENGTH = 7

-def label_validator(name: str):
-    """
-    A validator takes in a label
-    and ensures that it contains only letters, numbers, or underscores.
-    """
-    label_pattern = r'^\w+$'
-    def validate(label: str):
-        if not bool(re.match(label_pattern, label)):
-            raise ValueError(f"{name} label '{label}' contains invalid values. {name} labels can only contain letters, numbers, or underscores.")
-        return label
-    return validate
-
-class Dataset(BaseModel):
-    # We prefer AfterValidator here to allow pydantic to run its own
-    # validation & coercion logic before we check it against our own
-    # requirements
-    label: Annotated[str, AfterValidator(label_validator("Dataset"))]
-    node_files: list[str]
-    edge_files: list[str]
-    other_files: list[str]
-    data_dir: str
-
-    model_config = ConfigDict(extra='forbid')
-
 class GoldStandard(BaseModel):
     label: Annotated[str, AfterValidator(label_validator("Gold Standard"))]
     node_files: list[str] = []

@@ -131,7 +107,7 @@ class RawConfig(BaseModel):

     # See algorithms.py for more information about AlgorithmUnion
     algorithms: list[AlgorithmUnion] # type: ignore - pydantic allows this.
-    datasets: list[Dataset]
+    datasets: list[DatasetSchema]
     gold_standards: list[GoldStandard] = []
     analysis: Analysis = Analysis()
spras/config/util.py

Lines changed: 13 additions & 0 deletions

@@ -4,13 +4,26 @@
 only import this config file.
 """

+import re
 from enum import Enum
 from typing import Any

 import yaml
 from pydantic import BaseModel, ConfigDict


+def label_validator(name: str):
+    """
+    A validator takes in a label
+    and ensures that it contains only letters, numbers, or underscores.
+    """
+    label_pattern = r'^\w+$'
+    def validate(label: str):
+        if not bool(re.match(label_pattern, label)):
+            raise ValueError(f"{name} label '{label}' contains invalid values. {name} labels can only contain letters, numbers, or underscores.")
+        return label
+    return validate
+
 # https://stackoverflow.com/a/76883868/7589775
 class CaseInsensitiveEnum(str, Enum):
     """
