
Commit 339d915

Merge branch 'hash' into lazy-stats
2 parents 79cf748 + 6b941e0 commit 339d915

78 files changed, +567 -177282 lines changed


Snakefile

Lines changed: 7 additions & 8 deletions
@@ -35,7 +35,6 @@ def get_dataset(_datasets, label):
 algorithms = list(algorithm_params)
 algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
 dataset_labels = list(_config.config.datasets.keys())
-
 dataset_gold_standard_node_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['node_files'] for dataset in gs['dataset_labels']]
 dataset_gold_standard_edge_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['edge_files'] for dataset in gs['dataset_labels']]

@@ -62,10 +61,10 @@ def write_parameter_log(algorithm, param_label, logfile):
 def write_dataset_log(dataset, logfile):
     dataset_contents = get_dataset(_config.config.datasets,dataset)

-    # safe_dump gives RepresenterError for an OrderedDict
-    # config file has to convert the dataset from OrderedDict to dict to avoid this
-    with open(logfile,'w') as f:
-        yaml.safe_dump(dataset_contents,f)
+    # safe_dump gives RepresenterError for a DatasetSchema
+    # config file has to convert the dataset to a dict to avoid this
+    with open(logfile, 'w') as f:
+        yaml.safe_dump(dict(dataset_contents), f)

 # Choose the final files expected according to the config file options.
 def make_final_input(wildcards):

@@ -156,9 +155,9 @@ rule log_datasets:
 # Input preparation needs to be rerun if these files are modified
 def get_dataset_dependencies(wildcards):
     dataset = _config.config.datasets[wildcards.dataset]
-    all_files = dataset["node_files"] + dataset["edge_files"] + dataset["other_files"]
+    all_files = dataset.node_files + dataset.edge_files + dataset.other_files
     # Add the relative file path
-    all_files = [dataset["data_dir"] + SEP + data_file for data_file in all_files]
+    all_files = [dataset.data_dir + SEP + data_file for data_file in all_files]

     return all_files

@@ -283,7 +282,7 @@ rule reconstruct:
 # Original pathway reconstruction output to universal output
 # Use PRRunner as a wrapper to call the algorithm-specific parse_output
 rule parse_output:
-    input:
+    input:
         raw_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']),
         dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle'])
     output: standardized_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'pathway.txt'])
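The updated comment in `write_dataset_log` is the crux of the second hunk: `yaml.safe_dump` only knows how to represent plain Python types, so passing a model object directly raises a `RepresenterError`. A minimal sketch of that behavior, using a hypothetical `ExampleDataset` pydantic model in place of the real `DatasetSchema`:

    # Sketch only: assumes pydantic and PyYAML, as used elsewhere in this commit.
    import yaml
    from pydantic import BaseModel

    class ExampleDataset(BaseModel):  # hypothetical stand-in for DatasetSchema
        label: str
        data_dir: str

    ds = ExampleDataset(label="egfr", data_dir="input")

    try:
        yaml.safe_dump(ds)  # safe_dump cannot represent arbitrary objects
    except yaml.representer.RepresenterError as err:
        print(f"RepresenterError: {err}")

    # Converting the model to a dict first is what the updated write_dataset_log does.
    print(yaml.safe_dump(dict(ds)))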

docs/htcondor.rst

Lines changed: 3 additions & 2 deletions
@@ -54,10 +54,11 @@ might look like:
 
 .. code:: bash
 
-   apptainer build spras-v0.6.0.sif docker://reedcompbio/spras:v0.6.0
+   apptainer build spras-v0.6.0.sif docker://reedcompbio/spras:0.6.0
 
 After running this command, a new file called ``spras-v0.6.0.sif`` will
-exist in the directory where the command was run.
+exist in the directory where the command was run. Note that the Docker
+image does not use a "v" in the tag.
 
 Submitting All Jobs to a Single EP
 ----------------------------------

spras/allpairs.py

Lines changed: 0 additions & 2 deletions
@@ -35,8 +35,6 @@ def generate_inputs(data: Dataset, filename_map):
     # Get sources and targets for node input file
     # Borrowed code from pathlinker.py
     sources_targets = data.get_node_columns(["sources", "targets"])
-    if sources_targets is None:
-        raise ValueError("All Pairs Shortest Paths requires sources and targets")
 
     both_series = sources_targets.sources & sources_targets.targets
     for _index, row in sources_targets[both_series].iterrows():
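For context on the code this hunk keeps: the boolean AND of the `sources` and `targets` columns selects nodes flagged as both, which is what the loop then iterates over. A small pandas sketch with made-up data:

    import pandas as pd

    # Toy node table shaped like the sources/targets columns used above
    sources_targets = pd.DataFrame({
        "NODEID": ["A", "B", "C"],
        "sources": [True, True, False],
        "targets": [True, False, True],
    })

    # Element-wise AND keeps only the nodes that are both a source and a target
    both_series = sources_targets.sources & sources_targets.targets
    for _index, row in sources_targets[both_series].iterrows():
        print(row["NODEID"])  # prints only "A"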

spras/analysis/ml.py

Lines changed: 4 additions & 1 deletion
@@ -459,8 +459,11 @@ def jaccard_similarity_eval(summary_df: pd.DataFrame, output_file: str | PathLik
     ax.set_yticklabels(algorithms)
     plt.colorbar(cax, ax=ax)
     # annotate each cell with the corresponding similarity value
+    # where we set the precision to be lower as the number of algorithms increases
+    n = 2
+    if len(algorithms) > 10: n = 1
     for i in range(len(algorithms)):
         for j in range(len(algorithms)):
-            ax.text(j, i, f'{jaccard_matrix.values[i, j]:.2f}', ha='center', va='center', color='white')
+            ax.text(j, i, f'{jaccard_matrix.values[i, j]:.{n}f}', ha='center', va='center', color='white')
     plt.savefig(output_png, bbox_inches="tight", dpi=DPI)
     plt.close()
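The new annotation code uses a nested format specification, where the precision itself is interpolated into the f-string. A quick illustration with an arbitrary value:

    value = 0.4567
    for n in (2, 1):
        # the inner {n} sets the number of decimal places
        print(f"{value:.{n}f}")  # prints 0.46, then 0.5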

spras/analysis/summary.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 
 
 def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
-                        algo_with_params: list, statistics_files: list) -> pd.DataFrame:
+                        algo_with_params: list[str], statistics_files: list) -> pd.DataFrame:
     """
     Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
     in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the

spras/btb.py

Lines changed: 2 additions & 13 deletions
@@ -44,19 +44,8 @@ def generate_inputs(data, filename_map):
 
     # Get sources and write to file, repeat for targets
     # Does not check whether a node is a source and a target
-    for node_type in ['sources', 'targets']:
-        nodes = data.get_node_columns([node_type])
-        if nodes is None:
-            raise ValueError(f'No {node_type} found in the node files')
-
-        # TODO test whether this selection is needed, what values could the column contain that we would want to
-        # include or exclude?
-        nodes = nodes.loc[nodes[node_type]]
-        if node_type == "sources":
-            nodes.to_csv(filename_map["sources"], sep= '\t', index=False, columns=['NODEID'], header=False)
-        elif node_type == "targets":
-            nodes.to_csv(filename_map["targets"], sep= '\t', index=False, columns=['NODEID'], header=False)
-
+    for node_type, nodes in data.get_node_columns_separate(['sources', 'targets']).items():
+        nodes.to_csv(filename_map[node_type], sep='\t', index=False, columns=['NODEID'], header=False)
 
     # Create network file
     edges = data.get_interactome()
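A rough sketch of what the rewritten loop writes, assuming (not verified here) that `get_node_columns_separate` returns a mapping from each requested column name to a DataFrame of the nodes flagged in that column; the toy DataFrames and output paths below are hypothetical:

    import pandas as pd

    # Stand-in for data.get_node_columns_separate(['sources', 'targets'])
    node_columns = {
        "sources": pd.DataFrame({"NODEID": ["A", "B"]}),
        "targets": pd.DataFrame({"NODEID": ["C"]}),
    }
    filename_map = {"sources": "sources.txt", "targets": "targets.txt"}  # hypothetical paths

    # Same file layout as the old code: one NODEID per line, no header
    for node_type, nodes in node_columns.items():
        nodes.to_csv(filename_map[node_type], sep='\t', index=False, columns=['NODEID'], header=False)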

spras/config/config.py

Lines changed: 92 additions & 21 deletions
@@ -13,40 +13,86 @@
 """
 
 import copy as copy
+import functools
+import hashlib
+import importlib.metadata
 import itertools as it
-import os
+import subprocess
+import tomllib
 import warnings
+from pathlib import Path
 from typing import Any
 
 import numpy as np
 import yaml
 
 from spras.config.container_schema import ProcessedContainerSettings
-from spras.config.schema import RawConfig
-from spras.util import NpHashEncoder, hash_params_sha1_base32
+from spras.config.schema import DatasetSchema, RawConfig
+from spras.util import LoosePathLike, NpHashEncoder, hash_params_sha1_base32
 
 config = None
 
+@functools.cache
+def spras_revision() -> str:
+    """
+    Gets the revision of the current SPRAS repository. This function is meant to be user-friendly to warn for bad SPRAS installs.
+    1. If this file is inside the correct `.git` repository, we use the revision hash. This is for development in SPRAS as well as SPRAS installs via a cloned git repository.
+    2. If SPRAS was installed via a PyPA-compliant package manager, we use the hash of the RECORD file (https://packaging.python.org/en/latest/specifications/recording-installed-packages/#the-record-file).
+       which contains the hashes of all installed files to the package.
+    """
+    clone_tip = "Make sure SPRAS is installed through the installation instructions: https://spras.readthedocs.io/en/latest/install.html."
+
+    # Check if we're inside the right git repository
+    try:
+        project_directory = subprocess.check_output(
+            ["git", "rev-parse", "--show-toplevel"],
+            encoding='utf-8',
+            # In case the CWD is not inside the actual SPRAS directory
+            cwd=Path(__file__).parent.resolve()
+        ).strip()
+
+        # We check the pyproject.toml name attribute to confirm that this is the SPRAS project. This is susceptible
+        # to false negatives, but we use this as a preliminary check against bad SPRAS installs.
+        pyproject_path = Path(project_directory, 'pyproject.toml')
+        try:
+            pyproject_toml = tomllib.loads(pyproject_path.read_text())
+            if "project" not in pyproject_toml or "name" not in pyproject_toml["project"]:
+                raise RuntimeError(f"The git top-level `{pyproject_path}` does not have the expected attributes. {clone_tip}")
+            if pyproject_toml["project"]["name"] != "spras":
+                raise RuntimeError(f"The git top-level `{pyproject_path}` is not the SPRAS pyproject.toml. {clone_tip}")
+
+            return subprocess.check_output(
+                ["git", "rev-parse", "--short", "HEAD"],
+                encoding='utf-8',
+                cwd=project_directory
+            ).strip()
+        except FileNotFoundError as err:
+            # pyproject.toml wasn't found during the `read_text` call
+            raise RuntimeError(f"The git top-level {pyproject_path} wasn't found. {clone_tip}") from err
+        except tomllib.TOMLDecodeError as err:
+            raise RuntimeError(f"The git top-level {pyproject_path} is malformed. {clone_tip}") from err
+    except subprocess.CalledProcessError:
+        try:
+            # `git` failed: use the truncated hash of the RECORD file in .dist-info instead.
+            record_path = str(importlib.metadata.distribution('spras').locate_file(f"spras-{importlib.metadata.version('spras')}.dist-info/RECORD"))
+            with open(record_path, 'rb', buffering=0) as f:
+                # Truncated to the magic value 8, the length of the short git revision.
+                return hashlib.file_digest(f, 'sha256').hexdigest()[:8]
+        except importlib.metadata.PackageNotFoundError as err:
+            # The metadata.distribution call failed.
+            raise RuntimeError(f"The spras package wasn't found: {clone_tip}") from err
+
+def attach_spras_revision(label: str) -> str:
+    return f"{label}_{spras_revision()}"
+
 # This will get called in the Snakefile, instantiating the singleton with the raw config
 def init_global(config_dict):
     global config
     config = Config(config_dict)
 
 def init_from_file(filepath):
     global config
-
-    # Handle opening the file and parsing the yaml
-    filepath = os.path.abspath(filepath)
-    try:
-        with open(filepath, 'r') as yaml_file:
-            config_dict = yaml.safe_load(yaml_file)
-    except FileNotFoundError as e:
-        raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") from e
-    except yaml.YAMLError as e:
-        raise RuntimeError(f"Error: Failed to parse config '{filepath}'") from e
-
-    # And finally, initialize
-    config = Config(config_dict)
+    config = Config.from_file(filepath)
 
 
 class Config:

@@ -64,7 +110,7 @@ def __init__(self, raw_config: dict[str, Any]):
         # Directory used for storing output
         self.out_dir = parsed_raw_config.reconstruction_settings.locations.reconstruction_dir
         # A dictionary to store configured datasets against which SPRAS will be run
-        self.datasets = None
+        self.datasets: dict[str, DatasetSchema] = {}
         # A dictionary to store configured gold standard data against output of SPRAS runs
         self.gold_standards = None
         # The hash length SPRAS will use to identify parameter combinations.

@@ -103,6 +149,20 @@ def __init__(self, raw_config: dict[str, Any]):
 
         self.process_config(parsed_raw_config)
 
+    @classmethod
+    def from_file(cls, filepath: LoosePathLike):
+        # Handle opening the file and parsing the yaml
+        filepath = Path(filepath).absolute()
+        try:
+            with open(filepath, 'r') as yaml_file:
+                config_dict = yaml.safe_load(yaml_file)
+        except FileNotFoundError as e:
+            raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") from e
+        except yaml.YAMLError as e:
+            raise RuntimeError(f"Error: Failed to parse config '{filepath}'") from e
+
+        return cls(config_dict)
+
     def process_datasets(self, raw_config: RawConfig):
         """
         Parse dataset information

@@ -115,12 +175,17 @@ def process_datasets(self, raw_config: RawConfig):
         # Currently assumes all datasets have a label and the labels are unique
         # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts
         # Convert to dicts to simplify the yaml logging
-        self.datasets = {}
+
+        for dataset in raw_config.datasets:
+            dataset.label = attach_spras_revision(dataset.label)
+        for gold_standard in raw_config.gold_standards:
+            gold_standard.label = attach_spras_revision(gold_standard.label)
+
         for dataset in raw_config.datasets:
             label = dataset.label
             if label.lower() in [key.lower() for key in self.datasets.keys()]:
                 raise ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.")
-            self.datasets[label] = dict(dataset)
+            self.datasets[label] = dataset
 
         # parse gold standard information
         self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards}

@@ -129,8 +194,11 @@ def process_datasets(self, raw_config: RawConfig):
         dataset_labels = set(self.datasets.keys())
         gold_standard_dataset_labels = {dataset_label for value in self.gold_standards.values() for dataset_label in value['dataset_labels']}
         for label in gold_standard_dataset_labels:
-            if label not in dataset_labels:
+            if attach_spras_revision(label) not in dataset_labels:
                 raise ValueError(f"Dataset label '{label}' provided in gold standards does not exist in the existing dataset labels.")
+        # We attach the SPRAS revision to the individual dataset labels afterwards for a cleaner error message above.
+        for key, gold_standard in self.gold_standards.items():
+            self.gold_standards[key]["dataset_labels"] = map(attach_spras_revision, gold_standard["dataset_labels"])
 
         # Code snipped from Snakefile that may be useful for assigning default labels
         # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)]

@@ -186,7 +254,10 @@ def process_algorithms(self, raw_config: RawConfig):
                     run_dict[param] = float(value)
                 if isinstance(value, np.ndarray):
                     run_dict[param] = value.tolist()
-            params_hash = hash_params_sha1_base32(run_dict, self.hash_length, cls=NpHashEncoder)
+            # Incorporates the `spras_revision` into the hash
+            hash_run_dict = copy.deepcopy(run_dict)
+            hash_run_dict["_spras_rev"] = spras_revision()
+            params_hash = hash_params_sha1_base32(hash_run_dict, self.hash_length, cls=NpHashEncoder)
             if params_hash in prior_params_hashes:
                 raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file '
                                  f'(current length {self.hash_length}).')
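The last hunk folds the SPRAS revision into the parameter hash so parameter combinations from different revisions get distinct identifiers. The real code calls `spras.util.hash_params_sha1_base32` with `NpHashEncoder`; the toy helper below only sketches the idea by hashing a sorted JSON dump:

    import copy
    import hashlib
    import json

    def toy_param_hash(params: dict, length: int = 7) -> str:
        # Stand-in for hash_params_sha1_base32: deterministic digest of the dict
        serialized = json.dumps(params, sort_keys=True).encode()
        return hashlib.sha1(serialized).hexdigest()[:length]

    run_dict = {"k": 10, "w": 0.5}  # made-up algorithm parameters

    hash_run_dict = copy.deepcopy(run_dict)
    hash_run_dict["_spras_rev"] = "79cf748"  # e.g. one of this merge's parent revisions

    print(toy_param_hash(run_dict))       # hash without the revision
    print(toy_param_hash(hash_run_dict))  # differs, so outputs are not reused across revisions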

spras/config/dataset.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+from typing import Annotated
+
+from pydantic import AfterValidator, BaseModel, ConfigDict
+
+from spras.config.util import label_validator
+from spras.util import LoosePathLike
+
+
+class DatasetSchema(BaseModel):
+    """
+    Collection of information related to `Dataset` objects in the configuration.
+    """
+
+    # We prefer AfterValidator here to allow pydantic to run its own
+    # validation & coercion logic before we check it against our own
+    # requirements
+    label: Annotated[str, AfterValidator(label_validator("Dataset"))]
+    node_files: list[LoosePathLike]
+    edge_files: list[LoosePathLike]
+    other_files: list[LoosePathLike]
+    data_dir: LoosePathLike
+
+    model_config = ConfigDict(extra='forbid')
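A hedged usage sketch of the new `DatasetSchema`, assuming a SPRAS install that includes this commit; the label and file names are made-up examples:

    from pydantic import ValidationError

    from spras.config.dataset import DatasetSchema

    dataset = DatasetSchema(
        label="egfr_small",
        node_files=["node-prizes.txt"],
        edge_files=["interactome.txt"],
        other_files=[],
        data_dir="input",
    )
    print(dataset.label)

    try:
        # extra='forbid' rejects unknown keys, and the label validator rejects
        # anything other than letters, numbers, and underscores
        DatasetSchema(label="bad label!", node_files=[], edge_files=[],
                      other_files=[], data_dir="input", typo_field=1)
    except ValidationError as err:
        print(err)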

spras/config/schema.py

Lines changed: 3 additions & 27 deletions
@@ -10,14 +10,14 @@
 - `CaseInsensitiveEnum` (see ./util.py)
 """
 
-import re
 from typing import Annotated
 
 from pydantic import AfterValidator, BaseModel, ConfigDict
 
 from spras.config.algorithms import AlgorithmUnion
 from spras.config.container_schema import ContainerSettings
-from spras.config.util import CaseInsensitiveEnum
+from spras.config.dataset import DatasetSchema
+from spras.config.util import CaseInsensitiveEnum, label_validator
 
 # Most options here have an `include` property,
 # which is meant to make disabling parts of the configuration easier.

@@ -79,30 +79,6 @@ class Analysis(BaseModel):
 # The default length of the truncated hash used to identify parameter combinations
 DEFAULT_HASH_LENGTH = 7
 
-def label_validator(name: str):
-    """
-    A validator takes in a label
-    and ensures that it contains only letters, numbers, or underscores.
-    """
-    label_pattern = r'^\w+$'
-    def validate(label: str):
-        if not bool(re.match(label_pattern, label)):
-            raise ValueError(f"{name} label '{label}' contains invalid values. {name} labels can only contain letters, numbers, or underscores.")
-        return label
-    return validate
-
-class Dataset(BaseModel):
-    # We prefer AfterValidator here to allow pydantic to run its own
-    # validation & coercion logic before we check it against our own
-    # requirements
-    label: Annotated[str, AfterValidator(label_validator("Dataset"))]
-    node_files: list[str]
-    edge_files: list[str]
-    other_files: list[str]
-    data_dir: str
-
-    model_config = ConfigDict(extra='forbid')
-
 class GoldStandard(BaseModel):
     label: Annotated[str, AfterValidator(label_validator("Gold Standard"))]
     node_files: list[str] = []

@@ -131,7 +107,7 @@ class RawConfig(BaseModel):
 
     # See algorithms.py for more information about AlgorithmUnion
     algorithms: list[AlgorithmUnion] # type: ignore - pydantic allows this.
-    datasets: list[Dataset]
+    datasets: list[DatasetSchema]
     gold_standards: list[GoldStandard] = []
     analysis: Analysis = Analysis()

spras/config/util.py

Lines changed: 13 additions & 0 deletions
@@ -4,13 +4,26 @@
 only import this config file.
 """
 
+import re
 from enum import Enum
 from typing import Any
 
 import yaml
 from pydantic import BaseModel, ConfigDict
 
 
+def label_validator(name: str):
+    """
+    A validator takes in a label
+    and ensures that it contains only letters, numbers, or underscores.
+    """
+    label_pattern = r'^\w+$'
+    def validate(label: str):
+        if not bool(re.match(label_pattern, label)):
+            raise ValueError(f"{name} label '{label}' contains invalid values. {name} labels can only contain letters, numbers, or underscores.")
+        return label
+    return validate
+
 # https://stackoverflow.com/a/76883868/7589775
 class CaseInsensitiveEnum(str, Enum):
     """
