From 13824a679841c05338b77817b0f516c75a7a89f4 Mon Sep 17 00:00:00 2001
From: "Tristan F.-R."
Date: Mon, 23 Jun 2025 20:31:59 +0000
Subject: [PATCH 01/60] chore: re-refactor PRM properties

---
 CONTRIBUTING.md           |  2 --
 spras/allpairs.py         |  1 +
 spras/domino.py           |  1 +
 spras/meo.py              |  1 +
 spras/mincostflow.py      |  1 +
 spras/omicsintegrator1.py |  1 +
 spras/pathlinker.py       |  1 +
 spras/prm.py              | 22 +++++++++++++++-------
 8 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 02f5e5a45..41eb155de 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -142,8 +142,6 @@ and your editor's interpreter is set to using the SPRAS environment over the bas
 Note the behaviors of the `request_node_columns` function when there are missing values in that column of the node table and when multiple columns are requested.
 `request_node_columns` always returns the `NODEID` column in addition to the requested columns.
 
-Note: If you encounter a `'property' object is not iterable` error arising from inside the Snakefile, this means that `required_inputs` is not set. This is because when `required_inputs` is not set inside an algorithm wrapper, it falls back to the underlying unimplemented function inside the PRM base class, which, while it is marked as a property function, is non-static; therefore, when the runner utility class tries to dynamically fetch `required_inputs` with reflection, it ends up grabbing the `property` function instead of the underlying error, and tries to iterate over it (since `required_inputs` is usually a list.)
-
 Now implement the `generate_inputs` function. Start by inspecting the `omicsintegrator1.py` example, but note the differences in the expected file formats generated for the two algorithms with respect to the header rows and node prize column.
 The selected nodes should be any node in the dataset that has a prize set, any node that is active, any node that is a source, or any node that is a target.
diff --git a/spras/allpairs.py b/spras/allpairs.py
index adae77dbf..ea0ca5821 100644
--- a/spras/allpairs.py
+++ b/spras/allpairs.py
@@ -12,6 +12,7 @@
 
 class AllPairs(PRM):
     required_inputs = ['nodetypes', 'network']
+    doi = []
 
     @staticmethod
     def generate_inputs(data, filename_map):
diff --git a/spras/domino.py b/spras/domino.py
index 2364300aa..19f4b591e 100644
--- a/spras/domino.py
+++ b/spras/domino.py
@@ -28,6 +28,7 @@
 """
 class DOMINO(PRM):
     required_inputs = ['network', 'active_genes']
+    doi = ["10.15252/msb.20209593"]
 
     @staticmethod
     def generate_inputs(data, filename_map):
diff --git a/spras/meo.py b/spras/meo.py
index 172aa0363..ae97b6dec 100644
--- a/spras/meo.py
+++ b/spras/meo.py
@@ -84,6 +84,7 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None,
 
 class MEO(PRM):
     required_inputs = ['sources', 'targets', 'edges']
+    doi = ["10.1093/nar/gkq1207"]
 
     @staticmethod
     def generate_inputs(data, filename_map):
diff --git a/spras/mincostflow.py b/spras/mincostflow.py
index 84105bdaf..b7f33bf3b 100644
--- a/spras/mincostflow.py
+++ b/spras/mincostflow.py
@@ -24,6 +24,7 @@
 """
 class MinCostFlow (PRM):
     required_inputs = ['sources', 'targets', 'edges']
+    doi = ["10.1038/s41540-020-00167-1"]
 
     @staticmethod
     def generate_inputs(data, filename_map):
diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py
index f858d46fc..42dfde9aa 100644
--- a/spras/omicsintegrator1.py
+++ b/spras/omicsintegrator1.py
@@ -50,6 +50,7 @@ class OmicsIntegrator1(PRM):
     """
 
     required_inputs = ['prizes', 'edges', 'dummy_nodes']
+    doi = ["10.1371/journal.pcbi.1004879"]
 
     @staticmethod
     def generate_inputs(data, filename_map):
diff --git a/spras/pathlinker.py b/spras/pathlinker.py
index dde8b9c5c..03e771d06 100644
--- a/spras/pathlinker.py
+++ b/spras/pathlinker.py
@@ -24,6 +24,7 @@
 """
 class PathLinker(PRM):
     required_inputs = ['nodetypes', 'network']
+    doi = ["10.1038/npjsba.2016.2"]
 
     @staticmethod
     def generate_inputs(data, filename_map):
diff --git a/spras/prm.py b/spras/prm.py
index f1dc37231..bb0a17c87 100644
--- a/spras/prm.py
+++ b/spras/prm.py
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+import typing
 
 from spras.dataset import Dataset
 
@@ -10,13 +11,20 @@ class PRM(ABC):
     algorithms.
     """
 
-    @property
-    @staticmethod
-    @abstractmethod
-    def required_inputs(self):
-        # Note: This NotImplementedError will never trigger.
-        # See CONTRIBUTING.md for more information.
-        raise NotImplementedError
+    required_inputs: list[str] = []
+    # DOIs aren't strictly required (e.g. local neighborhood),
+    # but it should be explicitly declared that there are no DOIs.
+    doi: list[str] = typing.cast(list[str], None)
+
+    def __init_subclass__(cls):
+        # modified from https://stackoverflow.com/a/58206480/7589775
+        props = ["required_inputs", "dois"]
+        for prop in props:
+            if getattr(PRM, prop) is getattr(cls, prop):
+                raise NotImplementedError(
+                    "Attribute '{}' has not been overridden in class '{}'" \
+                        .format(prop, cls.__name__)
+                )
 
     @staticmethod
     @abstractmethod
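
A minimal, self-contained sketch of the enforcement pattern the patch above introduces (names simplified; the real base class is PRM and also declares abstract static methods). The point is that a wrapper that forgets to override a declared attribute now fails loudly at class-definition time, instead of surfacing later as the confusing `'property' object is not iterable` error that the deleted CONTRIBUTING.md note had to explain:

    class Base:
        required_inputs: list[str] = []

        def __init_subclass__(cls):
            # If the attribute resolved on the subclass is the very same object
            # as the one on the base class, the subclass never overrode it.
            if getattr(Base, "required_inputs") is getattr(cls, "required_inputs"):
                raise NotImplementedError(
                    f"Attribute 'required_inputs' has not been overridden in class '{cls.__name__}'"
                )

    class Good(Base):
        required_inputs = ["network"]  # accepted: the class body supplies its own value

    # class Bad(Base):
    #     pass
    # -> raises NotImplementedError as soon as the class statement executes

Note that `__init_subclass__` is implicitly a classmethod, so the check runs once per subclass definition with no decorator needed.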
From 543915f6c554a33bd190db31e3482f05b09c3f22 Mon Sep 17 00:00:00 2001
From: "Tristan F.-R."
Date: Mon, 23 Jun 2025 20:50:30 +0000
Subject: [PATCH 02/60] fix: correct prop names

---
 spras/allpairs.py         | 2 +-
 spras/domino.py           | 2 +-
 spras/meo.py              | 2 +-
 spras/mincostflow.py      | 2 +-
 spras/omicsintegrator1.py | 2 +-
 spras/omicsintegrator2.py | 1 +
 spras/pathlinker.py       | 2 +-
 spras/prm.py              | 2 +-
 8 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/spras/allpairs.py b/spras/allpairs.py
index ea0ca5821..3b79f5ef2 100644
--- a/spras/allpairs.py
+++ b/spras/allpairs.py
@@ -12,7 +12,7 @@
 
 class AllPairs(PRM):
     required_inputs = ['nodetypes', 'network']
-    doi = []
+    dois = []
 
     @staticmethod
     def generate_inputs(data, filename_map):
diff --git a/spras/domino.py b/spras/domino.py
index 19f4b591e..f9890a7d4 100644
--- a/spras/domino.py
+++ b/spras/domino.py
@@ -28,7 +28,7 @@
 """
 class DOMINO(PRM):
     required_inputs = ['network', 'active_genes']
-    doi = ["10.15252/msb.20209593"]
+    dois = ["10.15252/msb.20209593"]
 
     @staticmethod
     def generate_inputs(data, filename_map):
diff --git a/spras/meo.py b/spras/meo.py
index ae97b6dec..c94d57b80 100644
--- a/spras/meo.py
+++ b/spras/meo.py
@@ -84,7 +84,7 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None,
 
 class MEO(PRM):
     required_inputs = ['sources', 'targets', 'edges']
-    doi = ["10.1093/nar/gkq1207"]
+    dois = ["10.1093/nar/gkq1207"]
 
     @staticmethod
     def generate_inputs(data, filename_map):
diff --git a/spras/mincostflow.py b/spras/mincostflow.py
index b7f33bf3b..1c64d2aa1 100644
--- a/spras/mincostflow.py
+++ b/spras/mincostflow.py
@@ -24,7 +24,7 @@
 """
 class MinCostFlow (PRM):
     required_inputs = ['sources', 'targets', 'edges']
-    doi = ["10.1038/s41540-020-00167-1"]
+    dois = ["10.1038/s41540-020-00167-1"]
 
     @staticmethod
     def generate_inputs(data, filename_map):
diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py
index 42dfde9aa..976664d82 100644
--- a/spras/omicsintegrator1.py
+++ b/spras/omicsintegrator1.py
@@ -50,7 +50,7 @@ class OmicsIntegrator1(PRM):
     """
 
     required_inputs = ['prizes', 'edges', 'dummy_nodes']
-    doi = ["10.1371/journal.pcbi.1004879"]
+    dois = ["10.1371/journal.pcbi.1004879"]
 
     @staticmethod
     def generate_inputs(data, filename_map):
diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py
index b631da90f..26357f4fd 100644
--- a/spras/omicsintegrator2.py
+++ b/spras/omicsintegrator2.py
@@ -22,6 +22,7 @@
 """
 class OmicsIntegrator2(PRM):
     required_inputs = ['prizes', 'edges']
+    dois = []
 
     def generate_inputs(data: Dataset, filename_map):
         """
diff --git a/spras/pathlinker.py b/spras/pathlinker.py
index 03e771d06..dce31d9fe 100644
--- a/spras/pathlinker.py
+++ b/spras/pathlinker.py
@@ -24,7 +24,7 @@
 """
 class PathLinker(PRM):
     required_inputs = ['nodetypes', 'network']
-    doi = ["10.1038/npjsba.2016.2"]
+    dois = ["10.1038/npjsba.2016.2"]
 
     @staticmethod
     def generate_inputs(data, filename_map):
diff --git a/spras/prm.py b/spras/prm.py
index bb0a17c87..ca004f5b6 100644
--- a/spras/prm.py
+++ b/spras/prm.py
@@ -14,7 +14,7 @@ class PRM(ABC):
     required_inputs: list[str] = []
     # DOIs aren't strictly required (e.g. local neighborhood),
     # but it should be explicitly declared that there are no DOIs.
-    doi: list[str] = typing.cast(list[str], None)
+    dois: list[str] = typing.cast(list[str], None)
 
     def __init_subclass__(cls):
         # modified from https://stackoverflow.com/a/58206480/7589775
From 352ba56229fbbd3979583d0c36581b53663f140d Mon Sep 17 00:00:00 2001
From: "Tristan F.-R."
Date: Mon, 23 Jun 2025 20:53:58 +0000
Subject: [PATCH 03/60] style: fmt

---
 spras/prm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spras/prm.py b/spras/prm.py
index ca004f5b6..c3a16277d 100644
--- a/spras/prm.py
+++ b/spras/prm.py
@@ -1,5 +1,5 @@
-from abc import ABC, abstractmethod
 import typing
+from abc import ABC, abstractmethod
 
 from spras.dataset import Dataset
 

From 2fdb13cd2e801efe8a95f8832de9aeb6b27e8977 Mon Sep 17 00:00:00 2001
From: "Tristan F.-R."
Date: Mon, 23 Jun 2025 15:31:55 -0700
Subject: [PATCH 04/60] chore: add second doi in pl

---
 spras/pathlinker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spras/pathlinker.py b/spras/pathlinker.py
index dce31d9fe..fc94d9818 100644
--- a/spras/pathlinker.py
+++ b/spras/pathlinker.py
@@ -24,7 +24,7 @@
 """
 class PathLinker(PRM):
     required_inputs = ['nodetypes', 'network']
-    dois = ["10.1038/npjsba.2016.2"]
+    dois = ["10.1038/npjsba.2016.2", "10.1089/cmb.2012.0274"]
 
     @staticmethod
     def generate_inputs(data, filename_map):

From 49fd4beea68563570132afaff06e2aaa6e17e95d Mon Sep 17 00:00:00 2001
From: "Tristan F.-R."
Date: Tue, 24 Jun 2025 17:13:22 +0000
Subject: [PATCH 05/60] refactor: don't use globals in runner

makes 'unused variable' warnings in runner.py meaningful - this bit
@AMINOexe when running into an 'algorithm is not supported' error
despite having the class prepared.
---
 spras/runner.py | 50 ++++++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/spras/runner.py b/spras/runner.py
index 8490644c1..632cf2531 100644
--- a/spras/runner.py
+++ b/spras/runner.py
@@ -1,22 +1,35 @@
 # supported algorithm imports
-from spras.allpairs import AllPairs as allpairs
+from spras.allpairs import AllPairs
 from spras.dataset import Dataset
-from spras.domino import DOMINO as domino
-from spras.meo import MEO as meo
-from spras.mincostflow import MinCostFlow as mincostflow
-from spras.omicsintegrator1 import OmicsIntegrator1 as omicsintegrator1
-from spras.omicsintegrator2 import OmicsIntegrator2 as omicsintegrator2
-from spras.pathlinker import PathLinker as pathlinker
+from spras.domino import DOMINO
+from spras.meo import MEO
+from spras.mincostflow import MinCostFlow
+from spras.omicsintegrator1 import OmicsIntegrator1
+from spras.omicsintegrator2 import OmicsIntegrator2
+from spras.pathlinker import PathLinker
+from spras.prm import PRM
 
+algorithms: dict[str, type[PRM]] = {
+    "allpairs": AllPairs,
+    "domino": DOMINO,
+    "meo": MEO,
+    "mincostflow": MinCostFlow,
+    "omicsintegrator1": OmicsIntegrator1,
+    "omicsintegrator2": OmicsIntegrator2,
+    "pathlinker": PathLinker,
+}
+
+def get_algorithm(algorithm: str) -> type[PRM]:
+    try:
+        return algorithms[algorithm.lower()]
+    except KeyError as exc:
+        raise NotImplementedError(f'{algorithm} is not currently supported.') from exc
 
 def run(algorithm: str, params):
     """
     A generic interface to the algorithm-specific run functions
     """
-    try:
-        algorithm_runner = globals()[algorithm.lower()]
-    except KeyError as exc:
-        raise NotImplementedError(f'{algorithm} is not currently supported') from exc
+    algorithm_runner = get_algorithm(algorithm)
     algorithm_runner.run(**params)
 
 
@@ -26,10 +39,7 @@ def get_required_inputs(algorithm: str):
     @param algorithm: algorithm name
     @return: A list of strings of input files types
     """
-    try:
-        algorithm_runner = globals()[algorithm.lower()]
-    except KeyError as exc:
-        raise NotImplementedError(f'{algorithm} is not currently supported') from exc
+    algorithm_runner = get_algorithm(algorithm)
     return algorithm_runner.required_inputs
 
 
@@ -52,10 +62,7 @@ def prepare_inputs(algorithm: str, data_file: str, filename_map: dict[str, str])
     @return:
     """
     dataset = Dataset.from_file(data_file)
-    try:
-        algorithm_runner = globals()[algorithm.lower()]
-    except KeyError as exc:
-        raise NotImplementedError(f'{algorithm} is not currently supported') from exc
+    algorithm_runner = get_algorithm(algorithm)
     return algorithm_runner.generate_inputs(dataset, filename_map)
 
 
@@ -66,8 +73,5 @@ def parse_output(algorithm: str, raw_pathway_file: str, standardized_pathway_fil
     @param raw_pathway_file: pathway file produced by an algorithm's run function
     @param standardized_pathway_file: the same pathway written in the universal format
     """
-    try:
-        algorithm_runner = globals()[algorithm.lower()]
-    except KeyError as exc:
-        raise NotImplementedError(f'{algorithm} is not currently supported') from exc
+    algorithm_runner = get_algorithm(algorithm)
    return algorithm_runner.parse_output(raw_pathway_file, standardized_pathway_file)
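
The same lookup, reduced to a runnable sketch with stand-in classes instead of the real wrappers: an explicit registry dict keeps every import visibly used, so linters can flag genuinely unused names, and unknown algorithms still fail with the same NotImplementedError:

    class AllPairs: pass      # stand-ins for the real PRM subclasses
    class PathLinker: pass

    algorithms = {
        "allpairs": AllPairs,
        "pathlinker": PathLinker,
    }

    def get_algorithm(algorithm: str) -> type:
        try:
            # Case-insensitive, mirroring the algorithm.lower() call above
            return algorithms[algorithm.lower()]
        except KeyError as exc:
            raise NotImplementedError(f'{algorithm} is not currently supported.') from exc

    assert get_algorithm("PathLinker") is PathLinker
    # get_algorithm("mystery") raises NotImplementedError

By contrast, the old globals() lookup only worked because each class was imported under a lowercase alias, which made every one of those aliases look like dead code to static analysis.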
From c2e64f7eb0c6fbe0ab15b1e8f0707a2c63bbec47 Mon Sep 17 00:00:00 2001
From: "Tristan F."
Date: Wed, 25 Jun 2025 12:47:04 -0700
Subject: [PATCH 06/60] feat: begin config refactor

---
 config/config.yaml                        |   6 +-
 config/egfr-param-tuning.yaml             |   1 -
 config/egfr.yaml                          |   1 -
 docker-wrappers/SPRAS/example_config.yaml |   2 -
 environment.yml                           |   1 +
 pyproject.toml                            |   1 +
 spras/{ => config}/config.py              | 169 +++++++++++-----------
 spras/config/raw_config.py                |  64 ++++++++
 spras/containers.py                       |   2 +-
 test/AllPairs/test_ap.py                  |   2 +-
 test/DOMINO/test_domino.py                |   2 +-
 test/LocalNeighborhood/test_ln.py         |   2 +-
 test/MEO/test_meo.py                      |   2 +-
 test/MinCostFlow/test_mcf.py              |   2 +-
 test/OmicsIntegrator1/test_oi1.py         |   2 +-
 test/OmicsIntegrator2/test_oi2.py         |   2 +-
 test/PathLinker/test_pathlinker.py        |   2 +-
 test/analysis/input/config.yaml           |   1 -
 test/analysis/input/egfr.yaml             |   1 -
 test/analysis/test_cytoscape.py           |   2 +-
 test/analysis/test_summary.py             |   2 +-
 test/test_config.py                       |   2 +-
 test/test_util.py                         |   2 +-
 23 files changed, 169 insertions(+), 104 deletions(-)
 rename spras/{ => config}/config.py (80%)
 create mode 100644 spras/config/raw_config.py

diff --git a/config/config.yaml b/config/config.yaml
index 1f246dd15..3179dfedc 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -63,6 +63,11 @@ algorithms:
   - name: "omicsintegrator2"
     params:
       include: true
+      runs:
+        - b: 4
+          g: 0
+        - b: 2
+          g: 3
       run1:
         b: 4
         g: 0
@@ -144,7 +149,6 @@ reconstruction_settings:
     # TODO move to global
     reconstruction_dir: "output"
 
-  run: true
 
 analysis:
   # Create one summary per pathway file and a single summary table for all pathways for each dataset
diff --git a/config/egfr-param-tuning.yaml b/config/egfr-param-tuning.yaml
index a0a965b70..50a4788a2 100644
--- a/config/egfr-param-tuning.yaml
+++ b/config/egfr-param-tuning.yaml
@@ -3440,7 +3440,6 @@ gold_standards:
 reconstruction_settings:
   locations:
     reconstruction_dir: output/tps_egfr
-  run: true
 analysis:
   summary:
     include: true
diff --git a/config/egfr.yaml b/config/egfr.yaml
index b8c5138b8..cea3ad54b 100644
--- a/config/egfr.yaml
+++ b/config/egfr.yaml
@@ -74,7 +74,6 @@ datasets:
 reconstruction_settings:
   locations:
     reconstruction_dir: output/egfr
-  run: true
 analysis:
   graphspace:
     include: false
diff --git a/docker-wrappers/SPRAS/example_config.yaml b/docker-wrappers/SPRAS/example_config.yaml
index f7fd74e98..9791c2f23 100644
--- a/docker-wrappers/SPRAS/example_config.yaml
+++ b/docker-wrappers/SPRAS/example_config.yaml
@@ -123,8 +123,6 @@ reconstruction_settings:
     # TODO move to global
     reconstruction_dir: "output"
 
-  run: true
-
 analysis:
   # Create one summary per pathway file and a single summary table for all pathways for each dataset
   summary:
diff --git a/environment.yml b/environment.yml
index 6694b9812..7d14e3ea4 100644
--- a/environment.yml
+++ b/environment.yml
@@ -8,6 +8,7 @@ dependencies:
   - matplotlib=3.6
   - networkx=2.8
   - pandas=1.5
+  - pydantic=2.11.7
   - numpy=1.26.4
   - pre-commit=2.20 # Only required for development
   - pytest=8.0 # Only required for development
diff --git a/pyproject.toml b/pyproject.toml
index 3e90f7b1e..27dee2693 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "matplotlib==3.6",
     "networkx==2.8",
     "pandas==1.5",
+    "pydantic==2.11.7",
     "numpy==1.26.4",
     "pip==22.1",
     "requests==2.28",
diff --git a/spras/config.py b/spras/config/config.py
similarity index 80%
rename from spras/config.py
rename to spras/config/config.py
index 7bbf9cd1b..f6c5ada8b 100644
--- a/spras/config.py
+++ b/spras/config/config.py
@@ -16,19 +16,19 @@
 import itertools as it
 import os
 import re
+import warnings
 from collections.abc import Iterable
 
 import numpy as np
 import yaml
 
 from spras.util import NpHashEncoder, hash_params_sha1_base32
-
-# The default length of the truncated hash used to identify parameter combinations
-DEFAULT_HASH_LENGTH = 7
-DEFAULT_CONTAINER_PREFIX = "docker.io/reedcompbio"
+from spras.config.raw_config import ContainerFramework, RawConfig, DEFAULT_HASH_LENGTH
 
 config = None
 
+DEFAULT_CONTAINER_PREFIX = "docker.io/reedcompbio"
+
 # This will get called in the Snakefile, instantiating the singleton with the raw config
 def init_global(config_dict):
     global config
@@ -43,11 +43,9 @@ def init_from_file(filepath):
         with open(filepath, 'r') as yaml_file:
             config_dict = yaml.safe_load(yaml_file)
     except FileNotFoundError:
-        print(f"Error: The specified config '{filepath}' could not be found.")
-        return False
+        raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.")
     except yaml.YAMLError as e:
-        print(f"Error: Failed to parse config '{filepath}': {e}")
-        return False
+        raise RuntimeError(f"Error: Failed to parse config '{filepath}': {e}")
 
     # And finally, initialize
     config = Config(config_dict)
@@ -55,18 +53,15 @@ class Config:
     def __init__(self, raw_config):
-        # Since process_config winds up modifying the raw_config passed to it as a side effect,
-        # we'll make a deep copy here to guarantee we don't break anything. This preserves the
-        # config as it's given to the Snakefile by Snakemake
-
-        # Member vars populated by process_config. Set to None before they are populated so that our
-        # __init__ makes clear exactly what is being configured.
+        # Member vars populated by process_config. Any values that don't have sensible initial values are set to None
+        # before they are populated for __init__ to show exactly what is being configured.
+
         # Directory used for storing output
         self.out_dir = None
         # Container framework used by PRMs. Valid options are "docker", "dsub", and "singularity"
         self.container_framework = None
         # The container prefix (host and organization) to use for images. Default is "docker.io/reedcompbio"
-        self.container_prefix = DEFAULT_CONTAINER_PREFIX
+        self.container_prefix: str = DEFAULT_CONTAINER_PREFIX
         # A Boolean specifying whether to unpack singularity containers. Default is False
         self.unpack_singularity = False
         # A dictionary to store configured datasets against which SPRAS will be run
@@ -74,7 +69,7 @@ def __init__(self, raw_config):
         # A dictionary to store configured gold standard data against output of SPRAS runs
         self.gold_standards = None
         # The hash length SPRAS will use to identify parameter combinations. Default is 7
-        self.hash_length = DEFAULT_HASH_LENGTH
+        self.hash_length: int = DEFAULT_HASH_LENGTH
         # The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key.
         self.algorithms = None
         # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations.
@@ -107,44 +102,24 @@ def __init__(self, raw_config):
         # A Boolean specifying whether to run the evaluation per algorithm analysis
         self.analysis_include_evaluation_aggregate_algo = None
 
-        _raw_config = copy.deepcopy(raw_config)
-        self.process_config(_raw_config)
-
-    def process_config(self, raw_config):
+        # Since snakemake provides an empty config, we provide this
+        # wrapper error first before passing validation to pydantic.
         if raw_config == {}:
             raise ValueError("Config file cannot be empty. Use --configfile to set a config file.")
 
-        # Set up a few top-level config variables
-        self.out_dir = raw_config["reconstruction_settings"]["locations"]["reconstruction_dir"]
-
-        # We allow the container framework not to be defined in the config. In the case it isn't, default to docker.
-        # However, if we get a bad value, we raise an exception.
-        if "container_framework" in raw_config:
-            container_framework = raw_config["container_framework"].lower()
-            if container_framework not in ("docker", "singularity", "dsub"):
-                msg = "SPRAS was configured to run with an unknown container framework: '" + raw_config["container_framework"] + "'. Accepted values are 'docker', 'singularity' or 'dsub'."
-                raise ValueError(msg)
-            if container_framework == "dsub":
-                print("Warning: 'dsub' framework integration is experimental and may not be fully supported.")
-            self.container_framework = container_framework
-        else:
-            self.container_framework = "docker"
-
-        # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container.
-        if "unpack_singularity" in raw_config:
-            # The value in the config is a string, and we need to convert it to a bool.
-            unpack_singularity = raw_config["unpack_singularity"]
-            if unpack_singularity and self.container_framework != "singularity":
-                print("Warning: unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.")
-            self.unpack_singularity = unpack_singularity
-
-        # Grab registry from the config, and if none is provided default to docker
-        if "container_registry" in raw_config and raw_config["container_registry"]["base_url"] != "" and raw_config["container_registry"]["owner"] != "":
-            self.container_prefix = raw_config["container_registry"]["base_url"] + "/" + raw_config["container_registry"]["owner"]
-
-        # Parse dataset information
-        # Datasets is initially a list, where each list entry has a dataset label and lists of input files
-        # Convert the dataset list into a dict where the label is the key and update the config data structure
+        # Since process_config winds up modifying the raw_config passed to it as a side effect,
+        # we'll make a deep copy here to guarantee we don't break anything. This preserves the
+        # config as it's given to the Snakefile by Snakemake
+        _raw_config = copy.deepcopy(raw_config)
+        parsed_raw_config = RawConfig.model_validate_json(_raw_config)
+        self.process_config(parsed_raw_config)
+
+    def process_datasets(self, raw_config: RawConfig):
+        """
+        Parse dataset information
+        Datasets is initially a list, where each list entry has a dataset label and lists of input files
+        Convert the dataset list into a dict where the label is the key and update the config data structure
+        """
         # TODO allow labels to be optional and assign default labels
         # TODO check for collisions in dataset labels, warn, and make the labels unique
         # Need to work more on input file naming to make less strict assumptions
         # about the filename structure
         # Currently assumes all datasets have a label and the labels are unique
         # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts
         # Convert to dicts to simplify the yaml logging
-        self.datasets = {dataset["label"]: dict(dataset) for dataset in raw_config["datasets"]}
-
-        for key in self.datasets:
-            pattern = r'^\w+$'
-            if not bool(re.match(pattern, key)):
-                raise ValueError(f"Dataset label \'{key}\' contains invalid values. Dataset labels can only contain letters, numbers, or underscores.")
+        self.datasets = {}
+        for dataset in raw_config.datasets:
+            label = dataset.label
+            if label in self.datasets:
+                raise ValueError(f"Datasets must have unique labels, but the label {label} appears at least twice.")
+            self.datasets[label] = dict(dataset)
+
+            # Validate dataset labels
+            label_pattern = r'^\w+$'
+            if not bool(re.match(label_pattern, label)):
+                raise ValueError(f"Dataset label '{label}' contains invalid values. Dataset labels can only contain letters, numbers, or underscores.")
 
         # parse gold standard information
-        try:
-            self.gold_standards = {gold_standard["label"]: dict(gold_standard) for gold_standard in raw_config["gold_standards"]}
-        except:
-            self.gold_standards = {}
-
-        # check that gold_standard labels are formatted correctly
-        for key in self.gold_standards:
-            pattern = r'^\w+$'
-            if not bool(re.match(pattern, key)):
-                raise ValueError(f"Gold standard label \'{key}\' contains invalid values. Gold standard labels can only contain letters, numbers, or underscores.")
+        self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards}
 
         # check that all the dataset labels in the gold standards are existing datasets labels
         dataset_labels = set(self.datasets.keys())
@@ -182,33 +153,30 @@ def process_config(self, raw_config):
         # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)]
         # Maps from the dataset label to the dataset list index
         # dataset_dict = {dataset.get('label', f'dataset{index}'): index for index, dataset in enumerate(datasets)}
-
-        # Override the default parameter hash length if specified in the config file
-        if "hash_length" in raw_config and raw_config["hash_length"] != "":
-            self.hash_length = int(raw_config["hash_length"])
-
+
+    def process_algorithms(self, raw_config: RawConfig):
+        """
+        Parse algorithm information
+        Each algorithm's parameters are provided as a list of dictionaries
+        Defaults are handled in the Python function or class that wraps
+        running that algorithm
+        Keys in the parameter dictionary are strings
+        """
         prior_params_hashes = set()
-
-        # Parse algorithm information
-        # Each algorithm's parameters are provided as a list of dictionaries
-        # Defaults are handled in the Python function or class that wraps
-        # running that algorithm
-        # Keys in the parameter dictionary are strings
         self.algorithm_params = dict()
         self.algorithm_directed = dict()
-        self.algorithms = raw_config["algorithms"]
+        self.algorithms = raw_config.algorithms
         for alg in self.algorithms:
-            cur_params = alg["params"]
-            if "include" in cur_params and cur_params.pop("include"):
+            cur_params = alg.params
+            if cur_params.include:
                 # This dict maps from parameter combinations hashes to parameter combination dictionaries
-                self.algorithm_params[alg["name"]] = dict()
+                self.algorithm_params[alg.name] = dict()
             else:
                 # Do not parse the rest of the parameters for this algorithm if it is not included
                 continue
 
-            if "directed" in cur_params:
-                print("UPDATE: we no longer use the directed key in the config file")
-                cur_params.pop("directed")
+            if cur_params.directed != None:
+                warnings.warn("UPDATE: we no longer use the directed key in the config file")
 
             # The algorithm has no named arguments so create a default placeholder
             if len(cur_params) == 0:
                 cur_params["run1"] = {"spras_placeholder": ["no parameters"]}
 
             # Each set of runs should be 1 level down in the config file
@@ -210,7 +178,7 @@ def process_config(self, raw_config):
                         # Catch-all for strings
                         obj = [obj]
                     if not isinstance(obj, Iterable):
-                        raise ValueError(f"The object `{obj}` in algorithm {alg['name']} at key '{p}' in run '{run_params}' is not iterable!") from None
+                        raise ValueError(f"The object `{obj}` in algorithm {alg.name} at key '{p}' in run '{run_params}' is not iterable!") from None
                     all_runs.append(obj)
                 run_list_tuples = list(it.product(*all_runs))
                 param_name_tuple = tuple(param_name_list)
@@ -265,6 +233,39 @@ def process_config(self, raw_config):
                     if params_hash in prior_params_hashes:
                         raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file '
                                          f'(current length {self.hash_length}).')
-                    self.algorithm_params[alg["name"]][params_hash] = run_dict
+                    self.algorithm_params[alg.name][params_hash] = run_dict
 
+    def process_config(self, raw_config: RawConfig):
+        # Set up a few top-level config variables
+        self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir
+
+        # We allow the container framework not to be defined in the config. In the case it isn't, default to docker.
+        # However, if we get a bad value, we raise an exception.
+        if raw_config.container_framework != None:
+            container_framework = raw_config.container_framework
+            if container_framework == ContainerFramework.dsub:
+                warnings.warn("'dsub' framework integration is experimental and may not be fully supported.")
+            self.container_framework = container_framework
+        else:
+            self.container_framework = "docker"
+
+        # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container.
+        if raw_config.unpack_singularity:
+            # The value in the config is a string, and we need to convert it to a bool.
+            unpack_singularity = raw_config["unpack_singularity"]
+            if unpack_singularity and self.container_framework != "singularity":
+                warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.")
+            self.unpack_singularity = unpack_singularity
+
+        # Grab registry from the config, and if none is provided default to docker
+        if raw_config.container_registry and raw_config["container_registry"]["base_url"] != "" and raw_config["container_registry"]["owner"] != "":
+            self.container_prefix = raw_config["container_registry"]["base_url"] + "/" + raw_config["container_registry"]["owner"]
+
+        # Override the default parameter hash length if specified in the config file
+        if "hash_length" in raw_config and raw_config["hash_length"] != "":
+            self.hash_length = int(raw_config["hash_length"])
+
+        self.process_datasets(raw_config)
+        self.process_algorithms(raw_config)
+
         self.analysis_params = raw_config["analysis"] if "analysis" in raw_config else {}
         self.ml_params = self.analysis_params["ml"] if "ml" in self.analysis_params else {}
         self.evaluation_params = self.analysis_params["evaluation"] if "evaluation" in self.analysis_params else {}
diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py
new file mode 100644
index 000000000..1810bf0e5
--- /dev/null
+++ b/spras/config/raw_config.py
@@ -0,0 +1,64 @@
+"""
+Contains the raw pydantic schema for the configuration file.
+"""
+
+from enum import Enum
+from pydantic import BaseModel, ConfigDict, Field
+from typing import Optional
+
+# The default length of the truncated hash used to identify parameter combinations
+DEFAULT_HASH_LENGTH = 7
+
+class ContainerFramework(str, Enum):
+    docker = 'docker'
+    # TODO: add apptainer variant once #260 gets merged
+    singularity = 'singularity'
+    dsub = 'dsub'
+
+class ContainerRegistry(BaseModel):
+    base_url: str
+    owner: str = Field(description="The owner or project of the registry")
+
+class AlgorithmParams(BaseModel):
+    include: bool = Field(default=False)
+    directed: Optional[bool]
+    # TODO
+
+class Algorithm(BaseModel):
+    name: str
+    params: AlgorithmParams
+
+class Dataset(BaseModel):
+    label: str
+    node_files: list[str]
+    edge_files: list[str]
+    other_files: list[str]
+    data_dir: str
+
+class GoldStandard(BaseModel):
+    label: str
+    node_files: list[str]
+    data_dir: str
+    dataset_labels: list[str]
+
+class Locations(BaseModel):
+    reconstruction_dir: str
+
+class ReconstructionSettings(BaseModel):
+    locations: Locations
+
+class RawConfig(BaseModel):
+    # TODO: move this to nested container key
+    container_framework: Optional[ContainerFramework]
+    unpack_singularity: bool = Field(default=False)
+    container_registry: ContainerRegistry
+
+    hash_length: Optional[int] = Field(
+        description="The length of the hash used to identify a parameter combination",
+        default=DEFAULT_HASH_LENGTH)
+
+    algorithms: list[Algorithm]
+    datasets: list[Dataset]
+    gold_standards: list[GoldStandard] = Field(default=[])
+
+    reconstruction_settings: ReconstructionSettings
diff --git a/spras/containers.py b/spras/containers.py
index a1fda05f2..3e6c7c3fc 100644
--- a/spras/containers.py
+++ b/spras/containers.py
@@ -7,7 +7,7 @@
 
 import docker
 
-import spras.config as config
+import spras.config.config as config
 from spras.logging import indent
 from spras.util import hash_filename
 
diff --git a/test/AllPairs/test_ap.py b/test/AllPairs/test_ap.py
index 442b26a73..31dd612d9 100644
--- a/test/AllPairs/test_ap.py
+++ b/test/AllPairs/test_ap.py
@@ -4,7 +4,7 @@
 
 import pytest
-import spras.config as config
+import spras.config.config as config
 from spras.allpairs import AllPairs
 
 # Note that we don't directly use the config in the test, but we need the config
diff --git a/test/DOMINO/test_domino.py b/test/DOMINO/test_domino.py
index 7f09fa975..4323ea4c9 100644
--- a/test/DOMINO/test_domino.py
+++ b/test/DOMINO/test_domino.py
@@ -4,7 +4,7 @@
 
 import pytest
 
-import spras.config as config
+import spras.config.config as config
 from spras.domino import DOMINO, post_domino_id_transform, pre_domino_id_transform
 
 config.init_from_file("config/config.yaml")
diff --git a/test/LocalNeighborhood/test_ln.py b/test/LocalNeighborhood/test_ln.py
index fbee54902..9093efc68 100644
--- a/test/LocalNeighborhood/test_ln.py
+++ b/test/LocalNeighborhood/test_ln.py
@@ -4,7 +4,7 @@
 
 import pytest
 
-import spras.config as config
+import spras.config.config as config
 
 config.init_from_file("config/config.yaml")
diff --git a/test/MEO/test_meo.py b/test/MEO/test_meo.py
index e2abdb72d..32958be20 100644
--- a/test/MEO/test_meo.py
+++ b/test/MEO/test_meo.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-import spras.config as config
+import spras.config.config as config
 from spras.meo import MEO, write_properties
 
 config.init_from_file("config/config.yaml")
diff --git a/test/MinCostFlow/test_mcf.py b/test/MinCostFlow/test_mcf.py
index 89bd61d0b..c777a665d 100644
--- a/test/MinCostFlow/test_mcf.py
+++ b/test/MinCostFlow/test_mcf.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-import spras.config as config
+import spras.config.config as config
 from spras.mincostflow import MinCostFlow
 
 config.init_from_file("config/config.yaml")
diff --git a/test/OmicsIntegrator1/test_oi1.py b/test/OmicsIntegrator1/test_oi1.py
index 35b41d428..a484c0af3 100644
--- a/test/OmicsIntegrator1/test_oi1.py
+++ b/test/OmicsIntegrator1/test_oi1.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-import spras.config as config
+import spras.config.config as config
 from spras.omicsintegrator1 import OmicsIntegrator1, write_conf
 
 config.init_from_file("config/config.yaml")
diff --git a/test/OmicsIntegrator2/test_oi2.py b/test/OmicsIntegrator2/test_oi2.py
index 2a0a3e3c1..13f7f30b6 100644
--- a/test/OmicsIntegrator2/test_oi2.py
+++ b/test/OmicsIntegrator2/test_oi2.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-import spras.config as config
+import spras.config.config as config
 from spras.omicsintegrator2 import OmicsIntegrator2
 
 config.init_from_file("config/config.yaml")
diff --git a/test/PathLinker/test_pathlinker.py b/test/PathLinker/test_pathlinker.py
index 3fd6a96bd..ed9f10670 100644
--- a/test/PathLinker/test_pathlinker.py
+++ b/test/PathLinker/test_pathlinker.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-import spras.config as config
+import spras.config.config as config
 from spras.pathlinker import PathLinker
 
 config.init_from_file("config/config.yaml")
diff --git a/test/analysis/input/config.yaml b/test/analysis/input/config.yaml
index 833e6c4bb..49879e461 100644
--- a/test/analysis/input/config.yaml
+++ b/test/analysis/input/config.yaml
@@ -102,7 +102,6 @@ reconstruction_settings:
   locations:
     #place the save path here
     reconstruction_dir: "output"
-  run: true
 
 analysis:
   # Create one summary per pathway file and a single summary table for all pathways for each dataset
diff --git a/test/analysis/input/egfr.yaml b/test/analysis/input/egfr.yaml
index 1ddac1cae..281ecb495 100644
--- a/test/analysis/input/egfr.yaml
+++ b/test/analysis/input/egfr.yaml
@@ -91,7 +91,6 @@ datasets:
 reconstruction_settings:
   locations:
     reconstruction_dir: output/egfr
-  run: true
 analysis:
   graphspace:
     include: true
diff --git a/test/analysis/test_cytoscape.py b/test/analysis/test_cytoscape.py
index 7451b9876..68a77cd07 100644
--- a/test/analysis/test_cytoscape.py
+++ b/test/analysis/test_cytoscape.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-import spras.config as config
+import spras.config.config as config
 from spras.analysis.cytoscape import run_cytoscape
 
 config.init_from_file("test/analysis/input/config.yaml")
diff --git a/test/analysis/test_summary.py b/test/analysis/test_summary.py
index 4ff5396da..0400d1f1b 100644
--- a/test/analysis/test_summary.py
+++ b/test/analysis/test_summary.py
@@ -3,7 +3,7 @@
 
 import pandas as pd
 
-import spras.config as config
+import spras.config.config as config
 from spras.analysis.summary import summarize_networks
 from spras.dataset import Dataset
 
diff --git a/test/test_config.py b/test/test_config.py
index 6c773ddc0..26b18a4e9 100644
--- a/test/test_config.py
+++ b/test/test_config.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 
-import spras.config as config
+import spras.config.config as config
 
 
 # Set up a dummy config for testing. For now, only include things that MUST exist in the dict
diff --git a/test/test_util.py b/test/test_util.py
index baf9db0ed..2a25fc0d1 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-import spras.config as config
+import spras.config.config as config
 from spras.containers import convert_docker_path, prepare_path_docker, prepare_volume
 from spras.util import hash_params_sha1_base32
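
The shape of the validation this refactor moves toward, as a toy example (the field names below are illustrative, not the full schema): pydantic checks the whole YAML-derived dict at once, applies declared defaults, and reports missing or mistyped keys in one structured error instead of scattered membership tests:

    from pydantic import BaseModel, ValidationError

    class Locations(BaseModel):
        reconstruction_dir: str

    class ToyConfig(BaseModel):
        hash_length: int = 7              # default applies when the key is absent
        reconstruction_settings: Locations

    cfg = ToyConfig.model_validate({"reconstruction_settings": {"reconstruction_dir": "output"}})
    assert cfg.hash_length == 7

    try:
        ToyConfig.model_validate({})      # missing the required section
    except ValidationError as e:
        print(e)                          # one consolidated report of every problem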
From 4d1a19c54e22052d8e82f9c10bfdc791e4df5143 Mon Sep 17 00:00:00 2001
From: "Tristan F."
Date: Wed, 25 Jun 2025 20:49:36 +0000
Subject: [PATCH 07/60] feat: mostly structured config

---
 spras/config/__init__.py            |   0
 spras/config/config.py              | 113 +++++++++++++---------
 spras/config/raw_config.py          |  27 ++++++-
 spras/config/raw_config_analysis.py |  33 ++++++++
 4 files changed, 109 insertions(+), 64 deletions(-)
 create mode 100644 spras/config/__init__.py
 create mode 100644 spras/config/raw_config_analysis.py

diff --git a/spras/config/__init__.py b/spras/config/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spras/config/config.py b/spras/config/config.py
index f6c5ada8b..751c774a5 100644
--- a/spras/config/config.py
+++ b/spras/config/config.py
@@ -18,12 +18,13 @@
 import re
 import warnings
 from collections.abc import Iterable
+from typing import Any
 
 import numpy as np
 import yaml
 
 from spras.util import NpHashEncoder, hash_params_sha1_base32
-from spras.config.raw_config import ContainerFramework, RawConfig, DEFAULT_HASH_LENGTH
+from spras.config.raw_config import ContainerFramework, RawConfig
 
 config = None
 
@@ -53,13 +54,16 @@ def init_from_file(filepath):
 
 class Config:
-    def __init__(self, raw_config):
+    def __init__(self, raw_config: dict[str, Any]):
+        parsed_raw_config = RawConfig.model_validate(raw_config)
+        self.process_config(parsed_raw_config)
 
-        # Member vars populated by process_config. Any values that don't have sensible initial values are set to None
+        # Member vars populated by process_config. Any values that don't have quick initial values are set to None
         # before they are populated for __init__ to show exactly what is being configured.
 
         # Directory used for storing output
-        self.out_dir = None
+        self.out_dir = parsed_raw_config.reconstruction_settings.locations.reconstruction_dir
         # Container framework used by PRMs. Valid options are "docker", "dsub", and "singularity"
         self.container_framework = None
         # The container prefix (host and organization) to use for images. Default is "docker.io/reedcompbio"
         self.container_prefix: str = DEFAULT_CONTAINER_PREFIX
         # A Boolean specifying whether to unpack singularity containers. Default is False
         self.unpack_singularity = False
@@ -72,8 +76,8 @@ class Config:
         # A dictionary to store configured datasets against which SPRAS will be run
         self.datasets = None
         # A dictionary to store configured gold standard data against output of SPRAS runs
         self.gold_standards = None
-        # The hash length SPRAS will use to identify parameter combinations. Default is 7
-        self.hash_length: int = DEFAULT_HASH_LENGTH
+        # The hash length SPRAS will use to identify parameter combinations.
+        self.hash_length = parsed_raw_config.hash_length
         # The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key.
         self.algorithms = None
         # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations.
@@ -107,13 +110,7 @@ class Config:
         # A Boolean specifying whether to run the evaluation per algorithm analysis
         self.analysis_include_evaluation_aggregate_algo = None
 
+        # Since snakemake provides an empty config, we provide this
+        # wrapper error first before passing validation to pydantic.
         if raw_config == {}:
             raise ValueError("Config file cannot be empty. Use --configfile to set a config file.")
-
-        # Since process_config winds up modifying the raw_config passed to it as a side effect,
-        # we'll make a deep copy here to guarantee we don't break anything. This preserves the
-        # config as it's given to the Snakefile by Snakemake
-        _raw_config = copy.deepcopy(raw_config)
-        parsed_raw_config = RawConfig.model_validate_json(_raw_config)
-        self.process_config(parsed_raw_config)
 
     def process_datasets(self, raw_config: RawConfig):
@@ -119,7 +116,6 @@ class Config:
         Convert the dataset list into a dict where the label is the key and update the config data structure
         """
         # TODO allow labels to be optional and assign default labels
-        # TODO check for collisions in dataset labels, warn, and make the labels unique
         # Need to work more on input file naming to make less strict assumptions
@@ -168,8 +164,12 @@ class Config:
             if cur_params.directed is not None:
-                warnings.warn("UPDATE: we no longer use the directed key in the config file")
+                warnings.warn("UPDATE: we no longer use the directed key in the config file", stacklevel=2)
 
+            cur_params = cur_params.__pydantic_extra__
+            if not cur_params:
+                raise RuntimeError("An internal error occurred: ConfigDict extra should be set on AlgorithmParams.")
+
             # The algorithm has no named arguments so create a default placeholder
-            if len(cur_params) == 0:
+            if len(cur_params.keys()) == 0:
                 cur_params["run1"] = {"spras_placeholder": ["no parameters"]}
 
             # Each set of runs should be 1 level down in the config file
@@ -234,6 +233,39 @@ class Config:
                     self.algorithm_params[alg.name][params_hash] = run_dict
 
+    def process_analysis(self, raw_config: RawConfig):
+        if not raw_config.analysis:
+            return
+
+        self.analysis_params = raw_config.analysis
+        self.ml_params = self.analysis_params.ml if self.analysis_params.ml else {}
+        self.evaluation_params = self.analysis_params.evaluation if self.analysis_params.evaluation else {}
+
+        self.pca_params = {}
+        if self.ml_params.components:
+            self.pca_params["components"] = self.ml_params["components"]
+        if "labels" in self.ml_params:
+            self.pca_params["labels"] = self.ml_params["labels"]
+
+        self.hac_params = {}
+        if "linkage" in self.ml_params:
+            self.hac_params["linkage"] = self.ml_params["linkage"]
+        if "metric" in self.ml_params:
+            self.hac_params["metric"] = self.ml_params["metric"]
+
+        self.analysis_include_summary = raw_config.analysis.summary.include
+        self.analysis_include_graphspace = raw_config.analysis.graphspace.include
+        self.analysis_include_cytoscape = raw_config.analysis.cytoscape.include
+        self.analysis_include_ml = raw_config.analysis.ml.include
+        self.analysis_include_evaluation = raw_config.analysis.evaluation.include
+
+        # Only run ML aggregate per algorithm if analysis include ML is set to True
+        if self.ml_params.aggregate_per_algorithm and self.analysis_include_ml:
+            self.analysis_include_ml_aggregate_algo = raw_config.analysis.ml.aggregate_per_algorithm
+        else:
+            self.analysis_include_ml_aggregate_algo = False
 
@@ -276,16 +276,32 @@ class Config:
         # Only run Evaluation aggregate per algorithm if analysis include ML is set to True
-        if 'aggregate_per_algorithm' in self.evaluation_params and self.analysis_include_evaluation:
-            self.analysis_include_evaluation_aggregate_algo = raw_config["analysis"]["evaluation"]["aggregate_per_algorithm"]
+        if self.evaluation_params.aggregate_per_algorithm and self.analysis_include_evaluation:
+            self.analysis_include_evaluation_aggregate_algo = raw_config.analysis.evaluation.aggregate_per_algorithm
         else:
             self.analysis_include_evaluation_aggregate_algo = False
 
         # Only run Evaluation per algorithm if ML per algorithm is set to True
         if not self.analysis_include_ml_aggregate_algo:
             self.analysis_include_evaluation_aggregate_algo = False
 
+    def process_config(self, raw_config: RawConfig):
+        # Set up a few top-level config variables
+        self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir
+
+        if raw_config.container_framework == ContainerFramework.dsub:
+            warnings.warn("'dsub' framework integration is experimental and may not be fully supported.", stacklevel=2)
+        self.container_framework = raw_config.container_framework
+
+        # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container.
+        if raw_config.unpack_singularity and self.container_framework != "singularity":
+            warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.", stacklevel=2)
+        self.unpack_singularity = raw_config.unpack_singularity
+
+        # Grab registry from the config, and if none is provided default to docker
+        if raw_config.container_registry and raw_config.container_registry.base_url != "" and raw_config.container_registry.owner != "":
+            self.container_prefix = raw_config.container_registry.base_url + "/" + raw_config.container_registry.owner
+
+        self.process_datasets(raw_config)
+        self.process_algorithms(raw_config)
+        self.process_analysis(raw_config)
diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py
index 1810bf0e5..76992c8f1 100644
--- a/spras/config/raw_config.py
+++ b/spras/config/raw_config.py
@@ -6,6 +6,8 @@
 from pydantic import BaseModel, ConfigDict, Field
 from typing import Optional
 
+from spras.config.raw_config_analysis import Analysis
+
 # The default length of the truncated hash used to identify parameter combinations
 DEFAULT_HASH_LENGTH = 7
 
@@ -19,15 +21,21 @@ class ContainerRegistry(BaseModel):
     base_url: str
     owner: str = Field(description="The owner or project of the registry")
 
+    model_config = ConfigDict(extra='forbid')
+
 class AlgorithmParams(BaseModel):
     include: bool = Field(default=False)
     directed: Optional[bool]
-    # TODO
+
+    # TODO: use array of runs instead
+    model_config = ConfigDict(extra='allow')
 
 class Algorithm(BaseModel):
     name: str
     params: AlgorithmParams
 
+    model_config = ConfigDict(extra='forbid')
+
 class Dataset(BaseModel):
     label: str
     node_files: list[str]
@@ -35,30 +43,41 @@ class Dataset(BaseModel):
     other_files: list[str]
     data_dir: str
 
+    model_config = ConfigDict(extra='forbid')
+
 class GoldStandard(BaseModel):
     label: str
     node_files: list[str]
     data_dir: str
     dataset_labels: list[str]
 
+    model_config = ConfigDict(extra='forbid')
+
 class Locations(BaseModel):
     reconstruction_dir: str
 
+    model_config = ConfigDict(extra='forbid')
+
 class ReconstructionSettings(BaseModel):
     locations: Locations
 
+    model_config = ConfigDict(extra='forbid')
+
 class RawConfig(BaseModel):
-    # TODO: move this to nested container key
-    container_framework: Optional[ContainerFramework]
+    # TODO: move these container values to a nested container key
+    container_framework: ContainerFramework = Field(default=ContainerFramework.docker)
     unpack_singularity: bool = Field(default=False)
     container_registry: ContainerRegistry
 
-    hash_length: Optional[int] = Field(
+    hash_length: int = Field(
         description="The length of the hash used to identify a parameter combination",
         default=DEFAULT_HASH_LENGTH)
 
     algorithms: list[Algorithm]
     datasets: list[Dataset]
     gold_standards: list[GoldStandard] = Field(default=[])
+    analysis: Optional[Analysis]
 
     reconstruction_settings: ReconstructionSettings
+
+    model_config = ConfigDict(extra='forbid')
diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py
new file mode 100644
index 000000000..8743f5969
--- /dev/null
+++ b/spras/config/raw_config_analysis.py
@@ -0,0 +1,33 @@
+from pydantic import BaseModel
+from typing import Optional
+
+class SummaryAnalysis(BaseModel):
+    include: bool
+
+class GraphspaceAnalysis(BaseModel):
+    include: bool
+
+class CytoscapeAnalysis(BaseModel):
+    include: bool
+
+class MlAnalysis(BaseModel):
+    include: bool
+    aggregate_per_algorithm: bool
+    components: int
+    labels: bool
+    # TODO: enumify
+    linkage: str
+    # TODO: enumify
+    metric: str
+
+class EvaluationAnalysis(BaseModel):
+    include: bool
+    aggregate_per_algorithm: bool
+
+
+class Analysis(BaseModel):
+    summary: Optional[SummaryAnalysis]
+    graphspace: Optional[GraphspaceAnalysis]
+    cytoscape: Optional[CytoscapeAnalysis]
+    ml: Optional[MlAnalysis]
+    evaluation: Optional[EvaluationAnalysis]
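
How the `__pydantic_extra__` access used above behaves, in isolation (a toy model rather than the real AlgorithmParams): with `extra='allow'`, undeclared keys such as the per-run parameter blocks are retained during validation and surface afterwards as a plain dict:

    from pydantic import BaseModel, ConfigDict

    class Params(BaseModel):
        include: bool = False
        model_config = ConfigDict(extra='allow')

    p = Params.model_validate({"include": True, "run1": {"b": 4, "g": 0}})
    print(p.include)             # True, parsed as a declared field
    print(p.__pydantic_extra__)  # {'run1': {'b': 4, 'g': 0}}, everything undeclared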
From b56ecde361c9138cea993472d1f52c0adbf0dff2 Mon Sep 17 00:00:00 2001
From: "Tristan F."
Date: Wed, 25 Jun 2025 20:53:36 +0000
Subject: [PATCH 08/60] feat: add enum variants on ml

---
 config/config.yaml                  |  6 ------
 spras/config/config.py              |  8 ++++----
 spras/config/raw_config.py          |  3 ++-
 spras/config/raw_config_analysis.py | 22 +++++++++++++++++-----
 4 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index 3179dfedc..68c580683 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -63,11 +63,6 @@ algorithms:
   - name: "omicsintegrator2"
     params:
       include: true
-      runs:
-        - b: 4
-          g: 0
-        - b: 2
-          g: 3
       run1:
         b: 4
         g: 0
@@ -149,7 +144,6 @@ reconstruction_settings:
     # TODO move to global
     reconstruction_dir: "output"
 
-
 analysis:
   # Create one summary per pathway file and a single summary table for all pathways for each dataset
   summary:
diff --git a/spras/config/config.py b/spras/config/config.py
index 751c774a5..b6be80ef1 100644
--- a/spras/config/config.py
+++ b/spras/config/config.py
@@ -23,8 +23,8 @@
 import numpy as np
 import yaml
 
-from spras.util import NpHashEncoder, hash_params_sha1_base32
 from spras.config.raw_config import ContainerFramework, RawConfig
+from spras.util import NpHashEncoder, hash_params_sha1_base32
 
 config = None
 
@@ -110,7 +110,7 @@ class Config:
         if raw_config == {}:
             raise ValueError("Config file cannot be empty. Use --configfile to set a config file.")
-    
+
     def process_datasets(self, raw_config: RawConfig):
@@ -150,7 +150,7 @@ class Config:
         # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)]
         # Maps from the dataset label to the dataset list index
         # dataset_dict = {dataset.get('label', f'dataset{index}'): index for index, dataset in enumerate(datasets)}
-    
+
     def process_algorithms(self, raw_config: RawConfig):
@@ -172,7 +172,7 @@ class Config:
-            if cur_params.directed != None:
+            if cur_params.directed is not None:
                 warnings.warn("UPDATE: we no longer use the directed key in the config file")
diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py
index 76992c8f1..f60cb47c5 100644
--- a/spras/config/raw_config.py
+++ b/spras/config/raw_config.py
@@ -3,9 +3,10 @@
 """
 
 from enum import Enum
-from pydantic import BaseModel, ConfigDict, Field
 from typing import Optional
 
+from pydantic import BaseModel, ConfigDict, Field
+
 from spras.config.raw_config_analysis import Analysis
 
 # The default length of the truncated hash used to identify parameter combinations
diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py
index 8743f5969..7eae2a127 100644
--- a/spras/config/raw_config_analysis.py
+++ b/spras/config/raw_config_analysis.py
@@ -1,6 +1,9 @@
-from pydantic import BaseModel
+from enum import Enum
 from typing import Optional
 
+from pydantic import BaseModel
+
+
 class SummaryAnalysis(BaseModel):
     include: bool
 
@@ -10,15 +13,24 @@ class GraphspaceAnalysis(BaseModel):
 class CytoscapeAnalysis(BaseModel):
     include: bool
 
+class MlLinkage(str, Enum):
+    ward = 'ward'
+    complete = 'complete'
+    average = 'average'
+    single = 'single'
+
+class MlMetric(str, Enum):
+    euclidean = 'euclidean'
+    manhattan = 'manhattan'
+    cosine = 'cosine'
+
 class MlAnalysis(BaseModel):
     include: bool
     aggregate_per_algorithm: bool
-    components: int
-    labels: bool
-    # TODO: enumify
-    linkage: str
-    # TODO: enumify
-    metric: str
+    components: int = 2
+    labels: bool = True
+    linkage: MlLinkage = MlLinkage.ward
+    metric: MlMetric = MlMetric.euclidean
 
 class EvaluationAnalysis(BaseModel):
     include: bool
From bf95888ccbe00d90f124a998ab128f03b324c33f Mon Sep 17 00:00:00 2001
From: "Tristan F."
Date: Wed, 25 Jun 2025 21:06:53 +0000
Subject: [PATCH 09/60] fix: some defaults

---
 spras/config/config.py              |  6 +++---
 spras/config/raw_config.py          | 12 ++++++------
 spras/config/raw_config_analysis.py | 18 +++++++++---------
 test/test_config.py                 |  1 -
 4 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/spras/config/config.py b/spras/config/config.py
index b6be80ef1..75c9124c6 100644
--- a/spras/config/config.py
+++ b/spras/config/config.py
@@ -43,10 +43,10 @@ def init_from_file(filepath):
     try:
         with open(filepath, 'r') as yaml_file:
             config_dict = yaml.safe_load(yaml_file)
-    except FileNotFoundError:
-        raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.")
+    except FileNotFoundError as e:
+        raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") from e
     except yaml.YAMLError as e:
-        raise RuntimeError(f"Error: Failed to parse config '{filepath}': {e}")
+        raise RuntimeError(f"Error: Failed to parse config '{filepath}'") from e
 
     # And finally, initialize
     config = Config(config_dict)
diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py
index f60cb47c5..1580706e2 100644
--- a/spras/config/raw_config.py
+++ b/spras/config/raw_config.py
@@ -25,8 +25,8 @@ class ContainerRegistry(BaseModel):
     model_config = ConfigDict(extra='forbid')
 
 class AlgorithmParams(BaseModel):
-    include: bool = Field(default=False)
-    directed: Optional[bool]
+    include: bool = False
+    directed: Optional[bool] = None
 
     # TODO: use array of runs instead
     model_config = ConfigDict(extra='allow')
@@ -66,8 +66,8 @@ class ReconstructionSettings(BaseModel):
 
 class RawConfig(BaseModel):
     # TODO: move these container values to a nested container key
-    container_framework: ContainerFramework = Field(default=ContainerFramework.docker)
-    unpack_singularity: bool = Field(default=False)
+    container_framework: ContainerFramework = ContainerFramework.docker
+    unpack_singularity: bool = False
     container_registry: ContainerRegistry
 
     hash_length: int = Field(
@@ -76,8 +76,8 @@ class RawConfig(BaseModel):
 
     algorithms: list[Algorithm]
     datasets: list[Dataset]
-    gold_standards: list[GoldStandard] = Field(default=[])
-    analysis: Optional[Analysis]
+    gold_standards: list[GoldStandard] = []
+    analysis: Optional[Analysis] = None
 
     reconstruction_settings: ReconstructionSettings
 
diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py
index 7eae2a127..5682fb6e6 100644
--- a/spras/config/raw_config_analysis.py
+++ b/spras/config/raw_config_analysis.py
@@ -27,10 +27,10 @@ class MlMetric(str, Enum):
 class MlAnalysis(BaseModel):
     include: bool
     aggregate_per_algorithm: bool
-    components: int = 2
-    labels: bool = True
-    linkage: MlLinkage = MlLinkage.ward
-    metric: MlMetric = MlMetric.euclidean
+    components: int = 2
+    labels: bool = True
+    linkage: MlLinkage = MlLinkage.ward
+    metric: MlMetric = MlMetric.euclidean
 
 class EvaluationAnalysis(BaseModel):
     include: bool
@@ -38,8 +38,8 @@ class EvaluationAnalysis(BaseModel):
 
 class Analysis(BaseModel):
-    summary: Optional[SummaryAnalysis]
-    graphspace: Optional[GraphspaceAnalysis]
-    cytoscape: Optional[CytoscapeAnalysis]
-    ml: Optional[MlAnalysis]
-    evaluation: Optional[EvaluationAnalysis]
+    summary: Optional[SummaryAnalysis] = None
+    graphspace: Optional[GraphspaceAnalysis] = None
+    cytoscape: Optional[CytoscapeAnalysis] = None
+    ml: Optional[MlAnalysis] = None
+    evaluation: Optional[EvaluationAnalysis] = None
diff --git a/test/test_config.py b/test/test_config.py
index 26b18a4e9..3039e2dc3 100644
--- a/test/test_config.py
+++ b/test/test_config.py
@@ -25,7 +25,6 @@ def get_test_config():
         "datasets": [{"label": "alg1"}, {"label": "alg2"}],
         "gold_standards": [{"label": "gs1", "dataset_labels": []}],
         "algorithms": [
-            {"params": ["param2", "param2"]},
             {
                 "name": "strings",
                 "params": {

From 51d6a7b1efeee0b97f5ad6e1ca377f91fbd9ce19 Mon Sep 17 00:00:00 2001
From: "Tristan F."
Date: Wed, 25 Jun 2025 21:47:26 +0000
Subject: [PATCH 10/60] feat: fully finish config parsing

---
 spras/config/config.py              | 43 ++++++++++-------------------
 spras/config/raw_config.py          | 23 ++++++++++-----
 spras/config/raw_config_analysis.py | 21 +++++++-------
 spras/config/util_enum.py           | 14 ++++++++++
 test/test_config.py                 | 33 +++++++++++++++++++---
 5 files changed, 83 insertions(+), 51 deletions(-)
 create mode 100644 spras/config/util_enum.py

diff --git a/spras/config/config.py b/spras/config/config.py
index 75c9124c6..02ab61ca0 100644
--- a/spras/config/config.py
+++ b/spras/config/config.py
@@ -54,8 +54,12 @@ class Config:
     def __init__(self, raw_config: dict[str, Any]):
+        # Since snakemake provides an empty config, we provide this
+        # wrapper error first before passing validation to pydantic.
+        if raw_config == {}:
+            raise ValueError("Config file cannot be empty. Use --configfile to set a config file.")
+
         parsed_raw_config = RawConfig.model_validate(raw_config)
 
         # Member vars populated by process_config. Any values that don't have quick initial values are set to None
         # before they are populated for __init__ to show exactly what is being configured.
@@ -110,7 +114,7 @@ class Config:
         # A Boolean specifying whether to run the evaluation per algorithm analysis
         self.analysis_include_evaluation_aggregate_algo = None
 
-        # Since snakemake provides an empty config, we provide this
-        # wrapper error first before passing validation to pydantic.
-        if raw_config == {}:
-            raise ValueError("Config file cannot be empty. Use --configfile to set a config file.")
+        self.process_config(parsed_raw_config)
 
     def process_datasets(self, raw_config: RawConfig):
@@ -119,7 +116,6 @@ class Config:
         """
         # TODO allow labels to be optional and assign default labels
-        # TODO check for collisions in dataset labels, warn, and make the labels unique
         # Need to work more on input file naming to make less strict assumptions
         # about the filename structure
         # Currently assumes all datasets have a label and the labels are unique
         # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts
         # Convert to dicts to simplify the yaml logging
         self.datasets = {}
         for dataset in raw_config.datasets:
             label = dataset.label
             if label in self.datasets:
                 raise ValueError(f"Datasets must have unique labels, but the label {label} appears at least twice.")
             self.datasets[label] = dict(dataset)
-
-            # Validate dataset labels
-            label_pattern = r'^\w+$'
-            if not bool(re.match(label_pattern, label)):
-                raise ValueError(f"Dataset label '{label}' contains invalid values. Dataset labels can only contain letters, numbers, or underscores.")
-
+
         # parse gold standard information
         self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards}
@@ -168,8 +164,8 @@ class Config:
             if cur_params.directed is not None:
-                warnings.warn("UPDATE: we no longer use the directed key in the config file")
+                warnings.warn("UPDATE: we no longer use the directed key in the config file", stacklevel=2)
@@ -234,20 +230,14 @@ class Config:
     def process_analysis(self, raw_config: RawConfig):
         if not raw_config.analysis:
             return
 
         self.analysis_params = raw_config.analysis
-        self.ml_params = self.analysis_params.ml if self.analysis_params.ml else {}
-        self.evaluation_params = self.analysis_params.evaluation if self.analysis_params.evaluation else {}
-
-        self.pca_params = {}
-        if self.ml_params.components:
-            self.pca_params["components"] = self.ml_params["components"]
-        if "labels" in self.ml_params:
-            self.pca_params["labels"] = self.ml_params["labels"]
-
-        self.hac_params = {}
-        if "linkage" in self.ml_params:
-            self.hac_params["linkage"] = self.ml_params["linkage"]
-        if "metric" in self.ml_params:
-            self.hac_params["metric"] = self.ml_params ["metric"]
+        self.ml_params = self.analysis_params.ml
+        self.evaluation_params = self.analysis_params.evaluation
+
+        self.pca_params = self.ml_params
 
         self.analysis_include_summary = raw_config.analysis.summary.include
         self.analysis_include_graphspace = raw_config.analysis.graphspace.include
         self.analysis_include_cytoscape = raw_config.analysis.cytoscape.include
         self.analysis_include_ml = raw_config.analysis.ml.include
         self.analysis_include_evaluation = raw_config.analysis.evaluation.include
 
         # Only run ML aggregate per algorithm if analysis include ML is set to True
         if self.ml_params.aggregate_per_algorithm and self.analysis_include_ml:
-            self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]
+            self.analysis_include_ml_aggregate_algo = raw_config.analysis.ml.aggregate_per_algorithm
         else:
             self.analysis_include_ml_aggregate_algo = False
 
         # Only run Evaluation aggregate per algorithm if analysis include ML is set to True
         if self.evaluation_params.aggregate_per_algorithm and self.analysis_include_evaluation:
             self.analysis_include_evaluation_aggregate_algo = raw_config.analysis.evaluation.aggregate_per_algorithm
         else:
             self.analysis_include_evaluation_aggregate_algo = False
 
         # Only run Evaluation per algorithm if ML per algorithm is set to True
         if not self.analysis_include_ml_aggregate_algo:
             self.analysis_include_evaluation_aggregate_algo = False
 
     def process_config(self, raw_config: RawConfig):
         # Set up a few top-level config variables
         self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir
 
         if raw_config.container_framework == ContainerFramework.dsub:
-            warnings.warn("'dsub' framework integration is experimental and may not be fully supported.")
+            warnings.warn("'dsub' framework integration is experimental and may not be fully supported.", stacklevel=2)
         self.container_framework = raw_config.container_framework
 
         # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container.
         if raw_config.unpack_singularity and self.container_framework != "singularity":
-            warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.")
+            warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.", stacklevel=2)
         self.unpack_singularity = raw_config.unpack_singularity
 
         # Grab registry from the config, and if none is provided default to docker
         if raw_config.container_registry and raw_config.container_registry.base_url != "" and raw_config.container_registry.owner != "":
             self.container_prefix = raw_config.container_registry.base_url + "/" + raw_config.container_registry.owner
 
         self.process_datasets(raw_config)
         self.process_algorithms(raw_config)
         self.process_analysis(raw_config)
diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py
index 1580706e2..5ab6fed1a 100644
Dataset labels can only contain letters, numbers, or underscores.") - + # parse gold standard information self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards} @@ -173,7 +168,7 @@ def process_algorithms(self, raw_config: RawConfig): continue if cur_params.directed is not None: - warnings.warn("UPDATE: we no longer use the directed key in the config file") + warnings.warn("UPDATE: we no longer use the directed key in the config file", stacklevel=2) cur_params = cur_params.__pydantic_extra__ if not cur_params: raise RuntimeError("An internal error occurred: ConfigDict extra should be set on AlgorithmParams.") # The algorithm has no named arguments so create a default placeholder @@ -239,20 +234,10 @@ def process_analysis(self, raw_config: RawConfig): return self.analysis_params = raw_config.analysis - self.ml_params = self.analysis_params.ml if self.analysis_params.ml else {} - self.evaluation_params = self.analysis_params.evaluation if self.analysis_params.evaluation else {} - - self.pca_params = {} - if self.ml_params.components: - self.pca_params["components"] = self.ml_params["components"] - if "labels" in self.ml_params: - self.pca_params["labels"] = self.ml_params["labels"] + self.ml_params = self.analysis_params.ml + self.evaluation_params = self.analysis_params.evaluation - self.hac_params = {} - if "linkage" in self.ml_params: - self.hac_params["linkage"] = self.ml_params["linkage"] - if "metric" in self.ml_params: - self.hac_params["metric"] = self.ml_params ["metric"] + self.pca_params = self.ml_params self.analysis_include_summary = raw_config.analysis.summary.include self.analysis_include_graphspace = raw_config.analysis.graphspace.include @@ -262,7 +247,7 @@ # Only run ML aggregate per algorithm if analysis include ML is set to True if self.ml_params.aggregate_per_algorithm and self.analysis_include_ml: - self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] + self.analysis_include_ml_aggregate_algo = raw_config.analysis.ml.aggregate_per_algorithm else: self.analysis_include_ml_aggregate_algo = False @@ -290,12 +275,12 @@ def process_config(self, raw_config: RawConfig): self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir if raw_config.container_framework == ContainerFramework.dsub: - warnings.warn("'dsub' framework integration is experimental and may not be fully supported.") + warnings.warn("'dsub' framework integration is experimental and may not be fully supported.", stacklevel=2) self.container_framework = raw_config.container_framework # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. if raw_config.unpack_singularity and self.container_framework != "singularity": - warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.") + warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.", stacklevel=2) self.unpack_singularity = raw_config.unpack_singularity # Grab registry from the config, and if none is provided default to docker diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index 1580706e2..5ab6fed1a 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -2,17 +2,26 @@ Contains the raw pydantic schema for the configuration file.
""" -from enum import Enum -from typing import Optional +import re +from spras.config.util_enum import CaseInsensitiveEnum +from typing import Annotated, Optional -from pydantic import BaseModel, ConfigDict, Field +from pydantic import AfterValidator, BaseModel, ConfigDict, Field from spras.config.raw_config_analysis import Analysis # The default length of the truncated hash used to identify parameter combinations DEFAULT_HASH_LENGTH = 7 -class ContainerFramework(str, Enum): +def label_validator(name: str): + label_pattern = r'^\w+$' + def validate(label: str): + if not bool(re.match(label_pattern, label)): + raise ValueError(f"{name} label '{label}' contains invalid values. {name} labels can only contain letters, numbers, or underscores.") + return label + return validate + +class ContainerFramework(CaseInsensitiveEnum): docker = 'docker' # TODO: add apptainer variant once #260 gets merged singularity = 'singularity' @@ -26,7 +35,7 @@ class ContainerRegistry(BaseModel): class AlgorithmParams(BaseModel): include: bool = False - directed: Optional[bool] = None + directed: Annotated[Optional[bool], Field(deprecated=True)] = None # TODO: use array of runs instead model_config = ConfigDict(extra='allow') @@ -38,7 +47,7 @@ class Algorithm(BaseModel): model_config = ConfigDict(extra='forbid') class Dataset(BaseModel): - label: str + label: Annotated[str, AfterValidator(label_validator("Dataset"))] node_files: list[str] edge_files: list[str] other_files: list[str] @@ -47,7 +56,7 @@ class Dataset(BaseModel): model_config = ConfigDict(extra='forbid') class GoldStandard(BaseModel): - label: str + label: Annotated[str, AfterValidator(label_validator("Gold Standard"))] node_files: list[str] data_dir: str dataset_labels: list[str] diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py index 5682fb6e6..6b5be9bab 100644 --- a/spras/config/raw_config_analysis.py +++ b/spras/config/raw_config_analysis.py @@ -1,5 +1,4 @@ -from enum import Enum -from typing import Optional +from spras.config.util_enum import CaseInsensitiveEnum from pydantic import BaseModel @@ -13,20 +12,20 @@ class GraphspaceAnalysis(BaseModel): class CytoscapeAnalysis(BaseModel): include: bool -class MlLinkage(str, Enum): +class MlLinkage(CaseInsensitiveEnum): ward = 'ward' complete = 'complete' average = 'average' single = 'single' -class MlMetric(str, Enum): +class MlMetric(CaseInsensitiveEnum): euclidean = 'euclidean' manhattan = 'manhattan' cosine = 'cosine' class MlAnalysis(BaseModel): include: bool - aggregate_per_algorithm: bool + aggregate_per_algorithm: bool = False components: int = 2 labels: bool = True linkage: MlLinkage = MlLinkage.ward @@ -34,12 +33,12 @@ class MlAnalysis(BaseModel): class EvaluationAnalysis(BaseModel): include: bool - aggregate_per_algorithm: bool + aggregate_per_algorithm: bool = False class Analysis(BaseModel): - summary: Optional[SummaryAnalysis] = None - graphspace: Optional[GraphspaceAnalysis] = None - cytoscape: Optional[CytoscapeAnalysis] = None - ml: Optional[MlAnalysis] = None - evaluation: Optional[EvaluationAnalysis] = None + summary: SummaryAnalysis = SummaryAnalysis(include=False) + graphspace: GraphspaceAnalysis = GraphspaceAnalysis(include=False) + cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False) + ml: MlAnalysis = MlAnalysis(include=False) + evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) diff --git a/spras/config/util_enum.py b/spras/config/util_enum.py new file mode 100644 index 000000000..ec5700fd3 --- /dev/null +++ 
b/spras/config/util_enum.py @@ -0,0 +1,14 @@ +from enum import Enum +from typing import Any + +# https://stackoverflow.com/a/76883868/7589775 +class CaseInsensitiveEnum(str, Enum): + @classmethod + def _missing_(cls, value: Any): + if isinstance(value, str): + value = value.lower() + + for member in cls: + if member.lower() == value: + return member + return None diff --git a/test/test_config.py b/test/test_config.py index 3039e2dc3..9a3bf6549 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -4,7 +4,14 @@ import pytest import spras.config.config as config +from spras.config.raw_config import DEFAULT_HASH_LENGTH +filler_dataset_data: dict[str, str | list[str]] = { + "data_dir": "fake", + "edge_files": [], + "other_files": [], + "node_files": [] +} # Set up a dummy config for testing. For now, only include things that MUST exist in the dict # in order for the config init to complete. To test particular parts of the config initialization, @@ -22,8 +29,25 @@ def get_test_config(): "reconstruction_dir": "my_dir" } }, - "datasets": [{"label": "alg1"}, {"label": "alg2"}], - "gold_standards": [{"label": "gs1", "dataset_labels": []}], + "datasets": [{ + "label": "alg1", + "data_dir": "fake", + "edge_files": [], + "other_files": [], + "node_files": [] + }, { + "label": "alg2", + "data_dir": "faux", + "edge_files": [], + "other_files": [], + "node_files": [] + }], + "gold_standards": [{ + "label": "gs1", + "dataset_labels": [], + "node_files": [], + "data_dir": "gs-fake" + }], "algorithms": [ { "name": "strings", @@ -125,9 +149,9 @@ def test_config_hash_length(self): config.init_global(test_config) assert (config.config.hash_length == 7) - test_config["hash_length"] = "" + test_config.pop("hash_length", None) config.init_global(test_config) - assert (config.config.hash_length == config.DEFAULT_HASH_LENGTH) + assert (config.config.hash_length == DEFAULT_HASH_LENGTH) # Initialize the configuration test_config["hash_length"] = "12" @@ -193,6 +217,7 @@ def test_correct_dataset_label(self): test_config = get_test_config() correct_test_dicts = [{"label": "test"}, {"label": "123"}, {"label": "test123"}, {"label": "123test"}, {"label": "_"}, {"label": "test_test"}, {"label": "_test"}, {"label": "test_"}] + correct_test_dicts = [dict(list(d.items()) + list(filler_dataset_data.items())) for d in correct_test_dicts] for test_dict in correct_test_dicts: test_config["datasets"] = [test_dict] From a27d38decf17c3579753e3b8e7e9b603abc1e538 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
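A note on the `_missing_` hook in `CaseInsensitiveEnum` above: when a value lookup fails, Python's `Enum` machinery falls back to `_missing_`, which here lowercases the incoming string and compares it against each member's value. A minimal sketch of the resulting behavior (the `ContainerFramework` members mirror the schema in the patch; the asserts are illustrative only, not repo tests):

    from spras.config.util_enum import CaseInsensitiveEnum

    class ContainerFramework(CaseInsensitiveEnum):
        docker = 'docker'
        singularity = 'singularity'

    # An exact value match takes the normal Enum path...
    assert ContainerFramework('docker') is ContainerFramework.docker
    # ...while mixed-case config input is rescued by _missing_:
    assert ContainerFramework('Docker') is ContainerFramework.docker
    assert ContainerFramework('SINGULARITY') is ContainerFramework.singularity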
Date: Wed, 25 Jun 2025 21:49:28 +0000 Subject: [PATCH 11/60] style: fmt --- spras/config/config.py | 2 +- spras/config/raw_config.py | 2 +- spras/config/raw_config_analysis.py | 4 ++-- spras/config/util_enum.py | 1 + 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 02ab61ca0..b627fcc73 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -130,7 +130,7 @@ def process_datasets(self, raw_config: RawConfig): if label in self.datasets: raise ValueError(f"Datasets must have unique labels, but the label {label} appears at least twice.") self.datasets[label] = dict(dataset) - + # parse gold standard information self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards} diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index 5ab6fed1a..4c1cc3581 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -3,12 +3,12 @@ """ import re -from spras.config.util_enum import CaseInsensitiveEnum from typing import Annotated, Optional from pydantic import AfterValidator, BaseModel, ConfigDict, Field from spras.config.raw_config_analysis import Analysis +from spras.config.util_enum import CaseInsensitiveEnum # The default length of the truncated hash used to identify parameter combinations DEFAULT_HASH_LENGTH = 7 diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py index 6b5be9bab..194fc8f7c 100644 --- a/spras/config/raw_config_analysis.py +++ b/spras/config/raw_config_analysis.py @@ -1,7 +1,7 @@ -from spras.config.util_enum import CaseInsensitiveEnum - from pydantic import BaseModel +from spras.config.util_enum import CaseInsensitiveEnum + class SummaryAnalysis(BaseModel): include: bool diff --git a/spras/config/util_enum.py b/spras/config/util_enum.py index ec5700fd3..3e73eda98 100644 --- a/spras/config/util_enum.py +++ b/spras/config/util_enum.py @@ -1,6 +1,7 @@ from enum import Enum from typing import Any + # https://stackoverflow.com/a/76883868/7589775 class CaseInsensitiveEnum(str, Enum): @classmethod From dd4674a22d9af81c210093a8fd9da393074e38db Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Wed, 25 Jun 2025 22:03:04 +0000 Subject: [PATCH 12/60] fix: remove dep mark, use strict is None --- spras/config/config.py | 2 +- spras/config/raw_config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index b627fcc73..55d061607 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -171,7 +171,7 @@ def process_algorithms(self, raw_config: RawConfig): warnings.warn("UPDATE: we no longer use the directed key in the config file", stacklevel=2) cur_params = cur_params.__pydantic_extra__ - if not cur_params: + if cur_params is None: raise RuntimeError("An internal error occurred: ConfigDict extra should be set on AlgorithmParams.") # The algorithm has no named arguments so create a default placeholder diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index 4c1cc3581..caff7c690 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -35,7 +35,7 @@ class ContainerRegistry(BaseModel): class AlgorithmParams(BaseModel): include: bool = False - directed: Annotated[Optional[bool], Field(deprecated=True)] = None + directed: Optional[bool] = None # TODO: use array of runs instead model_config = ConfigDict(extra='allow') From 5a8826d24434613a611ccdfef69c81260e4d3129 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 25 Jun 2025 23:01:30 +0000 Subject: [PATCH 13/60] chore: correct config loc --- Snakefile | 2 +- spras/config/config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index df90f8e4a..f5817650d 100644 --- a/Snakefile +++ b/Snakefile @@ -5,7 +5,7 @@ import yaml from spras.dataset import Dataset from spras.evaluation import Evaluation from spras.analysis import ml, summary, graphspace, cytoscape -import spras.config as _config +import spras.config.config as _config # Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037 # and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them diff --git a/spras/config/config.py b/spras/config/config.py index 55d061607..5894dd304 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -6,7 +6,7 @@ module that imports this module can access a config option by checking the object's value. For example -import spras.config as config +import spras.config.config as config container_framework = config.config.container_framework will grab the top level registry configuration option as it appears in the config file From a47b0dfbe7128221f2ee378f3188402a83f60ae6 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 25 Jun 2025 23:45:53 +0000 Subject: [PATCH 14/60] fix: specify hac params --- spras/config/config.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spras/config/config.py b/spras/config/config.py index 5894dd304..0db7bedf1 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -239,6 +239,11 @@ def process_analysis(self, raw_config: RawConfig): self.pca_params = self.ml_params + self.hac_params = { + "linkage": self.ml_params.linkage, + "metric": self.ml_params.metric + } + self.analysis_include_summary = raw_config.analysis.summary.include self.analysis_include_graphspace = raw_config.analysis.graphspace.include self.analysis_include_cytoscape = raw_config.analysis.cytoscape.include From 8d756045da12d48e603c5d6f7a7846d41d24fd53 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R."
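The strict `is None` check in PATCH 12 above matters because of how pydantic populates `__pydantic_extra__`: with `extra='allow'` it is always a dict, so an algorithm configured with no extra parameters yields an empty dict, which is falsy. A small sketch of the distinction (the model `P` is invented for illustration):

    from pydantic import BaseModel, ConfigDict

    class P(BaseModel):
        include: bool
        model_config = ConfigDict(extra='allow')

    p = P(include=True)                # no extra keys supplied
    assert p.__pydantic_extra__ == {}  # an empty dict, not None
    # `if not cur_params` would misread a parameterless algorithm as the
    # internal error; `if cur_params is None` only fires when extras are
    # not being collected at all.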
Date: Thu, 26 Jun 2025 16:04:27 +0000 Subject: [PATCH 15/60] fix: expand class params --- spras/config/config.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 0db7bedf1..b479c2a53 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -233,9 +233,10 @@ def process_analysis(self, raw_config: RawConfig): if not raw_config.analysis: return - self.analysis_params = raw_config.analysis - self.ml_params = self.analysis_params.ml - self.evaluation_params = self.analysis_params.evaluation + # these params are classes - we need to turn them into var dicts + self.analysis_params = vars(raw_config.analysis) + self.ml_params = vars(self.analysis_params.ml) + self.evaluation_params = vars(self.analysis_params.evaluation) self.pca_params = self.ml_params From afa1de5605fee43de0457aa05c8ee956eb1982df Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 26 Jun 2025 16:06:36 +0000 Subject: [PATCH 16/60] fix: expand on pca_params --- spras/config/config.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index b479c2a53..515a677c5 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -233,12 +233,12 @@ def process_analysis(self, raw_config: RawConfig): if not raw_config.analysis: return - # these params are classes - we need to turn them into var dicts - self.analysis_params = vars(raw_config.analysis) - self.ml_params = vars(self.analysis_params.ml) - self.evaluation_params = vars(self.analysis_params.evaluation) + self.analysis_params = raw_config.analysis + self.ml_params = self.analysis_params.ml + self.evaluation_params = self.analysis_params.evaluation - self.pca_params = self.ml_params + # self.ml_params is a class, pca_params needs to be a dict. + self.pca_params = vars(self.ml_params) self.hac_params = { "linkage": self.ml_params.linkage, From 5eefc51f79ad38c8c086f0c0e233fa6f91966d36 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 26 Jun 2025 09:35:20 -0700 Subject: [PATCH 17/60] fix: drop include dict --- spras/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/config.py b/spras/config/config.py index 515a677c5..8d52ebd97 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -238,7 +238,7 @@ def process_analysis(self, raw_config: RawConfig): self.evaluation_params = self.analysis_params.evaluation # self.ml_params is a class, pca_params needs to be a dict. - self.pca_params = vars(self.ml_params) + self.pca_params = {k: v for k, v in vars(self.ml_params).items if k != 'include'} self.hac_params = { "linkage": self.ml_params.linkage, From 52431866627ce081795487c37064403a8045c829 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 26 Jun 2025 10:28:05 -0700 Subject: [PATCH 18/60] fix: call items --- spras/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/config.py b/spras/config/config.py index 8d52ebd97..23d8c44fd 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -238,7 +238,7 @@ def process_analysis(self, raw_config: RawConfig): self.evaluation_params = self.analysis_params.evaluation # self.ml_params is a class, pca_params needs to be a dict. 
- self.pca_params = {k: v for k, v in vars(self.ml_params).items if k != 'include'} + self.pca_params = {k: v for k, v in vars(self.ml_params).items() if k != 'include'} self.hac_params = { "linkage": self.ml_params.linkage, From 3b20c48bae7ef46a4d15a9b34dc65c399732f95c Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 26 Jun 2025 17:51:54 +0000 Subject: [PATCH 19/60] fix: better typing and defaults --- spras/config/config.py | 17 +++++++++-------- spras/config/raw_config.py | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 23d8c44fd..d944cc976 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -23,7 +23,7 @@ import numpy as np import yaml -from spras.config.raw_config import ContainerFramework, RawConfig +from spras.config.raw_config import ContainerFramework, RawConfig, Analysis from spras.util import NpHashEncoder, hash_params_sha1_base32 config = None @@ -86,9 +86,11 @@ def __init__(self, raw_config: dict[str, Any]): # Deprecated. Previously a dict mapping algorithm names to a Boolean tracking whether they used directed graphs. self.algorithm_directed = None # A dict with the analysis settings - self.analysis_params = None + self.analysis_params = parsed_raw_config.analysis + # A dict with the evaluation settings + self.evaluation_params = self.analysis_params.evaluation # A dict with the ML settings - self.ml_params = None + self.ml_params = self.analysis_params.ml # A Boolean specifying whether to run ML analysis for individual algorithms self.analysis_include_ml_aggregate_algo = None # A dict with the PCA settings @@ -233,12 +235,11 @@ def process_analysis(self, raw_config: RawConfig): if not raw_config.analysis: return - self.analysis_params = raw_config.analysis - self.ml_params = self.analysis_params.ml - self.evaluation_params = self.analysis_params.evaluation - # self.ml_params is a class, pca_params needs to be a dict. - self.pca_params = {k: v for k, v in vars(self.ml_params).items() if k != 'include'} + self.pca_params = { + "components": self.ml_params.components, + "labels": self.ml_params.labels + } self.hac_params = { "linkage": self.ml_params.linkage, diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index caff7c690..409b48427 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -86,7 +86,7 @@ class RawConfig(BaseModel): algorithms: list[Algorithm] datasets: list[Dataset] gold_standards: list[GoldStandard] = [] - analysis: Optional[Analysis] = None + analysis: Analysis = Analysis() reconstruction_settings: ReconstructionSettings From 2d4a90f669f77a9d209c658ce5596833a2d43881 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 26 Jun 2025 11:09:54 -0700 Subject: [PATCH 20/60] style: fmt --- spras/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/config.py b/spras/config/config.py index d944cc976..0c176cbab 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -23,7 +23,7 @@ import numpy as np import yaml -from spras.config.raw_config import ContainerFramework, RawConfig, Analysis +from spras.config.raw_config import Analysis, ContainerFramework, RawConfig from spras.util import NpHashEncoder, hash_params_sha1_base32 config = None From 31ba9d8415f5c0c514d92020331a65bbd6381194 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R."
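For context on the `vars(...)` detour in PATCHes 15-18: `vars()` on a pydantic model returns its `__dict__` of all field values, so the comprehension had to filter out `include` by hand, and one revision forgot to call `.items()`. The typed attribute access that PATCH 19 settles on avoids both pitfalls. A sketch with assumed field names:

    from pydantic import BaseModel

    class MlAnalysis(BaseModel):
        include: bool = False
        components: int = 2
        labels: bool = True

    ml = MlAnalysis()
    print(vars(ml))  # {'include': False, 'components': 2, 'labels': True}
    # Explicit, typed extraction leaves nothing to filter out by hand:
    pca = {"components": ml.components, "labels": ml.labels}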
Date: Tue, 1 Jul 2025 14:54:16 -0700 Subject: [PATCH 21/60] docs: mention oi2 paper link --- spras/omicsintegrator2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 26357f4fd..0dfdcc852 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -22,7 +22,8 @@ """ class OmicsIntegrator2(PRM): required_inputs = ['prizes', 'edges'] - dois = [] + # OI2 does not have a specific paper. Instead, we link to the OI1 paper. + dois = ["10.1371/journal.pcbi.1004879"] def generate_inputs(data: Dataset, filename_map): """ From fcbf67385817fd4c7841c004d59e6f624c747240 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 1 Jul 2025 15:03:33 -0700 Subject: [PATCH 22/60] chore: add nodoi to rwr --- spras/rwr.py | 1 + spras/strwr.py | 1 + 2 files changed, 2 insertions(+) diff --git a/spras/rwr.py b/spras/rwr.py index adeccaaed..5c08d6777 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -12,6 +12,7 @@ class RWR(PRM): required_inputs = ['network','nodes'] + dois = [] @staticmethod def generate_inputs(data, filename_map): diff --git a/spras/strwr.py b/spras/strwr.py index dfa1adc2a..fc8536507 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -13,6 +13,7 @@ # Note: This class is almost identical to the rwr.py file. class ST_RWR(PRM): required_inputs = ['network','sources','targets'] + dois = [] @staticmethod def generate_inputs(data, filename_map): From b9352e8dea4be6a0679a250899c25cb104f893b3 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 3 Jul 2025 14:57:29 -0700 Subject: [PATCH 23/60] fix: use correct naming convention for strwr --- spras/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/runner.py b/spras/runner.py index 0e956bc62..1235efc2c 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -24,7 +24,7 @@ "omicsintegrator2": OmicsIntegrator2, "pathlinker": PathLinker, "rwr": RWR, - "st_rwr": ST_RWR, + "strwr": ST_RWR, } def get_algorithm(algorithm: str) -> type[PRM]: From 2a4fb2ec5c923954c9748dc63009002264fcebf9 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Wed, 9 Jul 2025 10:40:44 -0700 Subject: [PATCH 24/60] refactor: add config forbid --- spras/config/raw_config_analysis.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py index 194fc8f7c..dbec5f1b9 100644 --- a/spras/config/raw_config_analysis.py +++ b/spras/config/raw_config_analysis.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from spras.config.util_enum import CaseInsensitiveEnum @@ -6,12 +6,13 @@ class SummaryAnalysis(BaseModel): include: bool -class GraphspaceAnalysis(BaseModel): - include: bool + model_config = ConfigDict(extra='forbid') class CytoscapeAnalysis(BaseModel): include: bool + model_config = ConfigDict(extra='forbid') + class MlLinkage(CaseInsensitiveEnum): ward = 'ward' complete = 'complete' @@ -31,14 +32,18 @@ class MlAnalysis(BaseModel): linkage: MlLinkage = MlLinkage.ward metric: MlMetric = MlMetric.euclidean + model_config = ConfigDict(extra='forbid') + class EvaluationAnalysis(BaseModel): include: bool aggregate_per_algorithm: bool = False + model_config = ConfigDict(extra='forbid') class Analysis(BaseModel): summary: SummaryAnalysis = SummaryAnalysis(include=False) - graphspace: GraphspaceAnalysis = GraphspaceAnalysis(include=False) cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False) ml: MlAnalysis = MlAnalysis(include=False) evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) + + model_config = ConfigDict(extra='forbid') From 4ded57eeb490b21a51feec81ed119e400fc363ef Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Wed, 9 Jul 2025 17:45:47 +0000 Subject: [PATCH 25/60] refactor: update config imports --- test/BowTieBuilder/test_btb.py | 2 +- test/RWR/test_RWR.py | 2 +- test/ST_RWR/test_STRWR.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/BowTieBuilder/test_btb.py b/test/BowTieBuilder/test_btb.py index 88b12d0dd..d4a458b3c 100644 --- a/test/BowTieBuilder/test_btb.py +++ b/test/BowTieBuilder/test_btb.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config config.init_from_file("config/config.yaml") diff --git a/test/RWR/test_RWR.py b/test/RWR/test_RWR.py index 4d6ce7864..b0316ded0 100644 --- a/test/RWR/test_RWR.py +++ b/test/RWR/test_RWR.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.rwr import RWR config.init_from_file("config/config.yaml") diff --git a/test/ST_RWR/test_STRWR.py b/test/ST_RWR/test_STRWR.py index a0a5b4ea9..898b24055 100644 --- a/test/ST_RWR/test_STRWR.py +++ b/test/ST_RWR/test_STRWR.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.strwr import ST_RWR config.init_from_file("config/config.yaml") From 22b568645ef19cde8d430c1bf77a1aff5057ee84 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
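The `extra='forbid'` settings introduced in PATCH 24 turn unknown keys in an analysis section into validation errors rather than silently ignored input, which is how config typos get surfaced. A hedged sketch mirroring the `SummaryAnalysis` model above:

    from pydantic import BaseModel, ConfigDict, ValidationError

    class SummaryAnalysis(BaseModel):
        include: bool
        model_config = ConfigDict(extra='forbid')

    try:
        SummaryAnalysis.model_validate({"include": True, "includ": True})  # typo'd key
    except ValidationError as err:
        print(err)  # reports 'includ' as an extra input that is not permitted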
Date: Wed, 9 Jul 2025 19:23:09 +0000 Subject: [PATCH 26/60] refactor: better names to schema files --- spras/config/{raw_config_analysis.py => analysis_schema.py} | 0 spras/config/config.py | 2 +- spras/config/{raw_config.py => schema.py} | 2 +- test/test_config.py | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename spras/config/{raw_config_analysis.py => analysis_schema.py} (100%) rename spras/config/{raw_config.py => schema.py} (98%) diff --git a/spras/config/raw_config_analysis.py b/spras/config/analysis_schema.py similarity index 100% rename from spras/config/raw_config_analysis.py rename to spras/config/analysis_schema.py diff --git a/spras/config/config.py b/spras/config/config.py index 2f4b44efa..293a08ec7 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -23,7 +23,7 @@ import numpy as np import yaml -from spras.config.raw_config import Analysis, ContainerFramework, RawConfig +from spras.config.schema import Analysis, ContainerFramework, RawConfig from spras.util import NpHashEncoder, hash_params_sha1_base32 config = None diff --git a/spras/config/raw_config.py b/spras/config/schema.py similarity index 98% rename from spras/config/raw_config.py rename to spras/config/schema.py index 409b48427..f882e5382 100644 --- a/spras/config/raw_config.py +++ b/spras/config/schema.py @@ -7,7 +7,7 @@ from pydantic import AfterValidator, BaseModel, ConfigDict, Field -from spras.config.raw_config_analysis import Analysis +from spras.config.analysis_schema import Analysis from spras.config.util_enum import CaseInsensitiveEnum # The default length of the truncated hash used to identify parameter combinations DEFAULT_HASH_LENGTH = 7 diff --git a/test/test_config.py b/test/test_config.py index 84d7d1d54..6095ad145 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -4,7 +4,7 @@ import pytest import spras.config.config as config -from spras.config.raw_config import DEFAULT_HASH_LENGTH +from spras.config.schema import DEFAULT_HASH_LENGTH filler_dataset_data: dict[str, str | list[str]] = { "data_dir": "fake", From 7df701dbc11ab69353e37386c3beb609b14cef69 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 10 Jul 2025 17:14:19 +0000 Subject: [PATCH 27/60] chore: add btb doi --- spras/btb.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spras/btb.py b/spras/btb.py index 416395a55..35d33bb72 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -25,6 +25,7 @@ class BowTieBuilder(PRM): required_inputs = ['sources', 'targets', 'edges'] + dois = ["10.1186/1752-0509-3-67"] #generate input taken from meo.py because they have same input requirements @staticmethod def generate_inputs(data, filename_map): From ea59e4cec7d5197fd394f3704a5406facaa6f1a9 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Fri, 11 Jul 2025 21:56:42 +0000 Subject: [PATCH 28/60] fix: no default include, mention model_config allow reason --- spras/config/schema.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index f882e5382..d1b86e06d 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -34,10 +34,12 @@ class ContainerRegistry(BaseModel): model_config = ConfigDict(extra='forbid') class AlgorithmParams(BaseModel): - include: bool = False + include: bool directed: Optional[bool] = None - # TODO: use array of runs instead + # TODO: use array of runs instead. We currently rely on the + # extra parameters here to extract the algorithm parameter information, + # which is why this deviates from the usual ConfigDict(extra='forbid').
model_config = ConfigDict(extra='allow') class Algorithm(BaseModel): From fa7d7c984b1e91454926fb0ffa1b51ecdc2377b5 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Fri, 11 Jul 2025 15:10:01 -0700 Subject: [PATCH 29/60] fix(config): case-insensitive check on labels --- spras/config/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 293a08ec7..c6ac8f8e0 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -127,8 +127,8 @@ def process_datasets(self, raw_config: RawConfig): self.datasets = {} for dataset in raw_config.datasets: label = dataset.label - if label in self.datasets: - raise ValueError(f"Datasets must have unique labels, but the label {label} appears at least twice.") + if label.lower() in [key.lower() for key in self.datasets.keys()]: + raise ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.") self.datasets[label] = dict(dataset) # parse gold standard information From 52eab214a9946a58f33ab34847da2119e2be6807 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 16:23:15 +0000 Subject: [PATCH 30/60] refactor: merge config --- spras/config/analysis_schema.py | 49 ----------------------------- spras/config/schema.py | 56 ++++++++++++++++++++++++++++++++- spras/config/util_enum.py | 4 +++ 3 files changed, 59 insertions(+), 50 deletions(-) delete mode 100644 spras/config/analysis_schema.py diff --git a/spras/config/analysis_schema.py b/spras/config/analysis_schema.py deleted file mode 100644 index dbec5f1b9..000000000 --- a/spras/config/analysis_schema.py +++ /dev/null @@ -1,49 +0,0 @@ -from pydantic import BaseModel, ConfigDict - -from spras.config.util_enum import CaseInsensitiveEnum - - -class SummaryAnalysis(BaseModel): - include: bool - - model_config = ConfigDict(extra='forbid') - -class CytoscapeAnalysis(BaseModel): - include: bool - - model_config = ConfigDict(extra='forbid') - -class MlLinkage(CaseInsensitiveEnum): - ward = 'ward' - complete = 'complete' - average = 'average' - single = 'single' - -class MlMetric(CaseInsensitiveEnum): - euclidean = 'euclidean' - manhattan = 'manhattan' - cosine = 'cosine' - -class MlAnalysis(BaseModel): - include: bool - aggregate_per_algorithm: bool = False - components: int = 2 - labels: bool = True - linkage: MlLinkage = MlLinkage.ward - metric: MlMetric = MlMetric.euclidean - - model_config = ConfigDict(extra='forbid') - -class EvaluationAnalysis(BaseModel): - include: bool - aggregate_per_algorithm: bool = False - - model_config = ConfigDict(extra='forbid') - -class Analysis(BaseModel): - summary: SummaryAnalysis = SummaryAnalysis(include=False) - cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False) - ml: MlAnalysis = MlAnalysis(include=False) - evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) - - model_config = ConfigDict(extra='forbid') diff --git a/spras/config/schema.py b/spras/config/schema.py index d1b86e06d..f4cad554c 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -1,5 +1,13 @@ """ Contains the raw pydantic schema for the configuration file. + +Using Pydantic as our backing config parser allows us to declaratively +type our config, giving us more robust user errors with guarantees +that parts of the config exist after parsing it through Pydantic. 
+ +We declare models using two classes here: +- `BaseModel` (docs: https://docs.pydantic.dev/latest/api/base_model/) +- `CaseInsensitiveEnum` (see ./util_enum.py) """ import re @@ -7,9 +15,55 @@ from pydantic import AfterValidator, BaseModel, ConfigDict, Field -from spras.config.analysis_schema import Analysis from spras.config.util_enum import CaseInsensitiveEnum + +class SummaryAnalysis(BaseModel): + include: bool + + model_config = ConfigDict(extra='forbid') + +class CytoscapeAnalysis(BaseModel): + include: bool + + model_config = ConfigDict(extra='forbid') + +class MlLinkage(CaseInsensitiveEnum): + ward = 'ward' + complete = 'complete' + average = 'average' + single = 'single' + +class MlMetric(CaseInsensitiveEnum): + euclidean = 'euclidean' + manhattan = 'manhattan' + cosine = 'cosine' + +class MlAnalysis(BaseModel): + include: bool + aggregate_per_algorithm: bool = False + components: int = 2 + labels: bool = True + linkage: MlLinkage = MlLinkage.ward + metric: MlMetric = MlMetric.euclidean + + model_config = ConfigDict(extra='forbid') + +class EvaluationAnalysis(BaseModel): + include: bool + aggregate_per_algorithm: bool = False + + model_config = ConfigDict(extra='forbid') + +class Analysis(BaseModel): + summary: SummaryAnalysis = SummaryAnalysis(include=False) + cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False) + ml: MlAnalysis = MlAnalysis(include=False) + evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) + + model_config = ConfigDict(extra='forbid') + + # The default length of the truncated hash used to identify parameter combinations DEFAULT_HASH_LENGTH = 7 diff --git a/spras/config/util_enum.py b/spras/config/util_enum.py index 3e73eda98..b7680222b 100644 --- a/spras/config/util_enum.py +++ b/spras/config/util_enum.py @@ -4,6 +4,10 @@ # https://stackoverflow.com/a/76883868/7589775 class CaseInsensitiveEnum(str, Enum): + """ + We prefer this over Enum to make sure the config parsing + is more relaxed when it comes to string enum values. + """ @classmethod def _missing_(cls, value: Any): if isinstance(value, str): From 5343fd0fff089f954d2b323bb0d1daf457d5a7b3 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 09:28:05 -0700 Subject: [PATCH 31/60] chore: deduplicate err --- spras/runner.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/spras/runner.py b/spras/runner.py index 1235efc2c..a023a9606 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -81,8 +81,5 @@ def parse_output(algorithm: str, raw_pathway_file: str, standardized_pathway_fil @param raw_pathway_file: pathway file produced by an algorithm's run function @param standardized_pathway_file: the same pathway written in the universal format """ - try: - algorithm_runner = get_algorithm(algorithm) - except KeyError as exc: - raise NotImplementedError(f'{algorithm} is not currently supported') from exc + algorithm_runner = get_algorithm(algorithm) return algorithm_runner.parse_output(raw_pathway_file, standardized_pathway_file, params) From 3c305f4b5b4c69e2eec44eee2bc8d2e6a1a92cfe Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 16:44:39 +0000 Subject: [PATCH 32/60] docs: use concepts link --- spras/config/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index f4cad554c..63fe1b613 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -6,7 +6,7 @@ that parts of the config exist after parsing it through Pydantic. 
We declare models using two classes here: -- `BaseModel` (docs: https://docs.pydantic.dev/latest/api/base_model/) +- `BaseModel` (docs: https://docs.pydantic.dev/latest/concepts/models/) - `CaseInsensitiveEnum` (see ./util_enum.py) """ From 49e50a03e85a82949e7913bb66a34c70195cb70a Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 18:13:59 +0000 Subject: [PATCH 33/60] refactor: mv util_enum -> util --- spras/config/schema.py | 2 +- spras/config/{util_enum.py => util.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename spras/config/{util_enum.py => util.py} (100%) diff --git a/spras/config/schema.py b/spras/config/schema.py index 63fe1b613..10991ad2f 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -15,7 +15,7 @@ from pydantic import AfterValidator, BaseModel, ConfigDict, Field -from spras.config.util_enum import CaseInsensitiveEnum +from spras.config.util import CaseInsensitiveEnum class SummaryAnalysis(BaseModel): diff --git a/spras/config/util_enum.py b/spras/config/util.py similarity index 100% rename from spras/config/util_enum.py rename to spras/config/util.py From cb28f61396c90415611ae970f4d7fdd7b924519b Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 11:14:19 -0700 Subject: [PATCH 34/60] docs: correct util_enum path --- spras/config/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index 10991ad2f..623c9dd9b 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -7,7 +7,7 @@ We declare models using two classes here: - `BaseModel` (docs: https://docs.pydantic.dev/latest/concepts/models/) -- `CaseInsensitiveEnum` (see ./util_enum.py) +- `CaseInsensitiveEnum` (see ./util.py) """ import re From 647f947636b061449996f0eff9cbb6af9bc450c3 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Mon, 14 Jul 2025 11:27:29 -0700 Subject: [PATCH 35/60] feat: rough draft of args design --- spras/allpairs.py | 13 ++++++------ spras/btb.py | 32 +++++++++--------------------- spras/config/util.py | 8 ++++++++ spras/domino.py | 47 +++++++++++++++++++------------------------- spras/prm.py | 20 ++++++++++++++----- 5 files changed, 59 insertions(+), 61 deletions(-) diff --git a/spras/allpairs.py b/spras/allpairs.py index 222794dbb..b1ffe2ee9 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -1,6 +1,7 @@ import warnings from pathlib import Path +from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import ( @@ -14,7 +15,7 @@ __all__ = ['AllPairs'] -class AllPairs(PRM): +class AllPairs(PRM[Empty]): required_inputs = ['nodetypes', 'network', 'directed_flag'] dois = [] @@ -71,7 +72,7 @@ def generate_inputs(data: Dataset, filename_map): header=["#Interactor1", "Interactor2", "Weight"]) @staticmethod - def run(nodetypes=None, network=None, directed_flag=None, output_file=None, container_framework="docker"): + def run(inputs, args, output_file, container_framework="docker"): """ Run All Pairs Shortest Paths with Docker @param nodetypes: input node types with sources and targets (required) @param network: input network file (required) @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) @param output_file: path to the output pathway file (required) """ - if not nodetypes or not network or not output_file or not directed_flag: + if not inputs["nodetypes"] or not inputs["network"] or not inputs["directed_flag"]: raise ValueError('Required All Pairs Shortest Paths arguments are missing') work_dir = '/apsp' @@ -87,10 +88,10 @@ def run(nodetypes=None, network=None, directed_flag=None, output_file=None, cont # Each volume is a tuple (src, dest) volumes = list() - bind_path, node_file = prepare_volume(nodetypes, work_dir) + bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir) volumes.append(bind_path) - bind_path, network_file = prepare_volume(network, work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir) volumes.append(bind_path) # Create the parent directories for the output file if needed @@ -103,7 +104,7 @@ def run(nodetypes=None, network=None, directed_flag=None, output_file=None, cont '--network', network_file, '--nodes', node_file, '--output', mapped_out_file] - if Path(directed_flag).read_text().strip() == "true": + if Path(inputs["directed_flag"]).read_text().strip() == "true": command.append("--directed") container_suffix = "allpairs:v4" diff --git a/spras/btb.py b/spras/btb.py index 35d33bb72..a4098ee08 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -1,5 +1,6 @@ from pathlib import Path +from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( convert_undirected_to_directed, @@ -23,19 +24,13 @@ Interactor1 Interactor2 Weight """ -class BowTieBuilder(PRM): +class BowTieBuilder(PRM[Empty]): required_inputs = ['sources', 'targets', 'edges'] dois = ["10.1186/1752-0509-3-67"] #generate input taken from meo.py because they have same input requirements @staticmethod def generate_inputs(data, filename_map): - """ - Access fields from the dataset and write the required input files - @param data: dataset - @param filename_map: a dict mapping
file types in the required_inputs to the filename for that type - @return: - """ for input_type in BowTieBuilder.required_inputs: if input_type not in filename_map: raise ValueError(f"{input_type} filename is missing") @@ -70,30 +65,21 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(sources=None, targets=None, edges=None, output_file=None, container_framework="docker"): - """ - Run BTB with Docker - @param sources: input source file (required) - @param targets: input target file (required) - @param edges: input edge file (required) - @param output_file: path to the output pathway file (required) - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - """ - + def run(inputs, args, output_file, container_framework="docker"): # Tests for pytest (docker container also runs this) # Testing out here avoids the trouble that container errors provide - if not sources or not targets or not edges or not output_file: + if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: raise ValueError('Required BowTieBuilder arguments are missing') - if not Path(sources).exists() or not Path(targets).exists() or not Path(edges).exists(): + if not Path(inputs["sources"]).exists() or not Path(inputs["targets"]).exists() or not Path(inputs["edges"]).exists(): raise ValueError('Missing input file') # Testing for btb index errors # TODO: This error will never actually occur if the inputs are passed through # `generate_inputs`. See the discussion about removing this or making this a habit at # https://github.com/Reed-CompBio/spras/issues/306. - with open(edges, 'r') as edge_file: + with open(inputs["edges"], 'r') as edge_file: try: for line in edge_file: line = line.strip().split('\t')[2] @@ -107,13 +93,13 @@ def run(sources=None, targets=None, edges=None, output_file=None, container_fram # Each volume is a tuple (src, dest) volumes = list() - bind_path, source_file = prepare_volume(sources, work_dir) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir) volumes.append(bind_path) - bind_path, target_file = prepare_volume(targets, work_dir) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir) volumes.append(bind_path) - bind_path, edges_file = prepare_volume(edges, work_dir) + bind_path, edges_file = prepare_volume(inputs["edges"], work_dir) volumes.append(bind_path) # Use its --output argument to set the output file prefix to specify an absolute path and prefix diff --git a/spras/config/util.py b/spras/config/util.py index b7680222b..c23374a50 100644 --- a/spras/config/util.py +++ b/spras/config/util.py @@ -1,4 +1,5 @@ from enum import Enum +from pydantic import BaseModel, ConfigDict from typing import Any @@ -17,3 +18,10 @@ def _missing_(cls, value: Any): if member.lower() == value: return member return None + + +class Empty(BaseModel): + """ + The empty base model. Used for specifying that an algorithm takes no parameters. 
+ """ + model_config = ConfigDict(extra="forbid") diff --git a/spras/domino.py b/spras/domino.py index 5205a81cd..a70f1a1e3 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -2,6 +2,8 @@ from pathlib import Path import pandas as pd +from pydantic import BaseModel, ConfigDict +from typing import Optional from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( @@ -16,6 +18,14 @@ ID_PREFIX = 'ENSG0' ID_PREFIX_LEN = len(ID_PREFIX) +class DominoParams(BaseModel): + module_threshold: Optional[float] + "the p-value threshold for considering a slice as relevant (optional)" + + slice_threshold: Optional[float] + "the p-value threshold for considering a putative module as final module (optional)" + + model_config = ConfigDict(use_attribute_docstrings=True) """ DOMINO will construct a fully undirected graph from the provided input file @@ -26,18 +36,12 @@ - the expected raw input file should have node pairs in the 1st and 3rd columns, with a 'ppi' in the 2nd column - it can include repeated and bidirectional edges """ -class DOMINO(PRM): +class DOMINO(PRM[DominoParams]): required_inputs = ['network', 'active_genes'] dois = ["10.15252/msb.20209593"] @staticmethod def generate_inputs(data, filename_map): - """ - Access fields from the dataset and write the required input files - @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type - @return: - """ for input_type in DOMINO.required_inputs: if input_type not in filename_map: raise ValueError(f"{input_type} filename is missing") @@ -72,20 +76,9 @@ def generate_inputs(data, filename_map): header=['ID_interactor_A', 'ppi', 'ID_interactor_B']) @staticmethod - def run(network=None, active_genes=None, output_file=None, slice_threshold=None, module_threshold=None, container_framework="docker"): - """ - Run DOMINO with Docker. - Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. - DOMINO produces multiple output module files in an HTML format. SPRAS concatenates these files into one file. - @param network: input network file (required) - @param active_genes: input active genes (required) - @param output_file: path to the output pathway file (required) - @param slice_threshold: the p-value threshold for considering a slice as relevant (optional) - @param module_threshold: the p-value threshold for considering a putative module as final module (optional) - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - """ - - if not network or not active_genes or not output_file: + def run(inputs, args, output_file, container_framework="docker"): + # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. 
+ if not inputs["network"] or not inputs["active_genes"]: raise ValueError('Required DOMINO arguments are missing') work_dir = '/spras' @@ -93,10 +86,10 @@ def run(network=None, active_genes=None, output_file=None, slice_threshold=None, # Each volume is a tuple (source, destination) volumes = list() - bind_path, network_file = prepare_volume(network, work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir) volumes.append(bind_path) - bind_path, node_file = prepare_volume(active_genes, work_dir) + bind_path, node_file = prepare_volume(inputs["active_genes"], work_dir) volumes.append(bind_path) out_dir = Path(output_file).parent @@ -132,11 +125,11 @@ def run(network=None, active_genes=None, output_file=None, slice_threshold=None, '--visualization', 'true'] # Add optional arguments - if slice_threshold is not None: + if args.slice_threshold is not None: # DOMINO readme has the wrong argument https://github.com/Shamir-Lab/DOMINO/issues/12 - domino_command.extend(['--slice_threshold', str(slice_threshold)]) - if module_threshold is not None: - domino_command.extend(['--module_threshold', str(module_threshold)]) + domino_command.extend(['--slice_threshold', str(args.slice_threshold)]) + if args.module_threshold is not None: + domino_command.extend(['--module_threshold', str(args.module_threshold)]) run_container_and_log('DOMINO', container_framework, diff --git a/spras/prm.py b/spras/prm.py index b5d8501dd..06d005b2a 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -1,11 +1,12 @@ -import typing from abc import ABC, abstractmethod -from typing import Any +from pydantic import BaseModel +from typing import Any, cast, TypeVar, Generic from spras.dataset import Dataset +T = TypeVar('T', bound=BaseModel) -class PRM(ABC): +class PRM(ABC, Generic[T]): """ The PRM (Pathway Reconstruction Module) class, which defines the interface that `runner.py` uses to handle @@ -15,7 +16,7 @@ class PRM(ABC): required_inputs: list[str] = [] # DOIs aren't strictly required (e.g. local neighborhood), # but it should be explicitly declared that there are no DOIs. - dois: list[str] = typing.cast(list[str], None) + dois: list[str] = cast(list[str], None) def __init_subclass__(cls): # modified from https://stackoverflow.com/a/58206480/7589775 @@ -30,11 +31,20 @@ def __init_subclass__(cls): @staticmethod @abstractmethod def generate_inputs(data: Dataset, filename_map: dict[str, str]): + """ + Access fields from the dataset and write the required input files + @param data: dataset + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type + """ raise NotImplementedError @staticmethod @abstractmethod - def run(**kwargs): + def run(inputs: dict[str, str], args: T, output_file: str, container_framework="docker"): + """ + Runs an algorithm with the specified inputs, algorithm params (T), + the designated output_file, and the desired container_framework. + """ raise NotImplementedError @staticmethod From 76011e07978d38fd41dded82434a3e8a5f210154 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Mon, 14 Jul 2025 18:57:10 +0000 Subject: [PATCH 36/60] feat: type oi1/oi2, rwr/strwr --- spras/omicsintegrator1.py | 103 ++++++++++++++++++++++++-------------- spras/omicsintegrator2.py | 76 ++++++++++++++++++++-------- spras/rwr.py | 24 ++++++--- spras/strwr.py | 27 ++++++---- 4 files changed, 152 insertions(+), 78 deletions(-) diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 0d3eb4bfd..7a69a01d6 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -1,4 +1,6 @@ from pathlib import Path +from pydantic import BaseModel, ConfigDict +from typing import Optional from spras.containers import prepare_volume, run_container_and_log from spras.interactome import reinsert_direction_col_mixed @@ -35,8 +37,47 @@ def write_conf(filename=Path('config.txt'), w=None, b=None, d=None, mu=None, noi f.write('processes = 1\n') f.write('threads = 1\n') +class OmicsIntegrator1Params(BaseModel): + dummy_mode: Optional[str] = None + mu_squared: Optional[str] = None + exclude_terms: Optional[str] = None + + noisy_edges: Optional[int] = None + "How many times you would like to add noise to the given edge values and re-run the algorithm." + + shuffled_prizes: Optional[int] = None + "How many times the algorithm should shuffle the prizes and re-run" + + random_terminals: Optional[int] = None + "How many times to apply the given prizes to random nodes in the interactome" + + seed: Optional[str] = None + "the randomness seed to use" + + w: Optional[float] = None + "the number of trees" + + b: Optional[str] = None + "the trade-off between including more terminals and using less reliable edges" + + d: Optional[str] = None + "controls the maximum path-length from v0 to terminal nodes" + + mu: Optional[float] = None + "controls the degree-based negative prizes (default 0.0)" + + noise: Optional[str] = None + "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations" + + g: Optional[str] = None + "(Gamma) multiplicative edge penalty from degree of endpoints" + + r: Optional[str] = None + "msgsteiner parameter that adds random noise to edges, which is rarely needed because the Forest --noisyEdges option is recommended instead (default 0)" + + model_config = ConfigDict(use_attribute_docstrings=True) + +class OmicsIntegrator1(PRM[OmicsIntegrator1Params]): """ Omics Integrator 1 works with partially directed graphs - it takes in the universal input directly @@ -96,27 +137,12 @@ def generate_inputs(data, filename_map): with open(filename_map['dummy_nodes'], mode='w'): pass - # TODO add parameter validation # TODO add support for knockout argument # TODO add reasonable default values # TODO document required arguments @staticmethod - def run(edges=None, prizes=None, dummy_nodes=None, dummy_mode=None, mu_squared=None, exclude_terms=None, - output_file=None, noisy_edges=None, shuffled_prizes=None, random_terminals=None, - seed=None, w=None, b=None, d=None, mu=None, noise=None, g=None, r=None, container_framework="docker"): - """ - Run Omics Integrator 1 in the Docker image with the provided parameters. - Does not support the garnet, cyto30, knockout, cv, or cv-reps arguments. - The configuration file is generated from the provided arguments. - Does not support the garnetBeta, processes, or threads configuration file parameters. - The msgpath is not required because msgsteiner is available in the Docker image. - Only the optimal forest sif file is retained. - All other output files are deleted.
- @param output_file: the name of the output sif file for the optimal forest, which will overwrite any - existing file with this name - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - """ - if edges is None or prizes is None or output_file is None or w is None or b is None or d is None: + def run(inputs, args, output_file, container_framework="docker"): + if inputs["edges"] is None or inputs["prizes"] is None or output_file is None or w is None or b is None or d is None: raise ValueError('Required Omics Integrator 1 arguments are missing') work_dir = '/spras' @@ -124,10 +150,10 @@ def run(edges=None, prizes=None, dummy_nodes=None, dummy_mode=None, mu_squared=N # Each volume is a tuple (src, dest) volumes = list() - bind_path, edge_file = prepare_volume(edges, work_dir) + bind_path, edge_file = prepare_volume(inputs["edges"], work_dir) volumes.append(bind_path) - bind_path, prize_file = prepare_volume(prizes, work_dir) + bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir) volumes.append(bind_path) # 4 dummy mode possibilities: @@ -137,10 +163,10 @@ def run(edges=None, prizes=None, dummy_nodes=None, dummy_mode=None, mu_squared=N # 4. file -> connect the dummy node to a specific list of nodes provided in a file # add dummy node file to the volume if dummy_mode is not None and it is 'file' - if dummy_mode == 'file': - if dummy_nodes is None: + if args.dummy_mode == 'file': + if inputs["dummy_nodes"] is None: raise ValueError("dummy_nodes file is required when dummy_mode is set to 'file'") - bind_path, dummy_file = prepare_volume(dummy_nodes, work_dir) + bind_path, dummy_file = prepare_volume(inputs["dummy_nodes"], work_dir) volumes.append(bind_path) out_dir = Path(output_file).parent @@ -152,7 +178,8 @@ def run(edges=None, prizes=None, dummy_nodes=None, dummy_mode=None, mu_squared=N conf_file = 'oi1-configuration.txt' conf_file_local = Path(out_dir, conf_file) # Temporary file that will be deleted after running Omics Integrator 1 - write_conf(conf_file_local, w=w, b=b, d=d, mu=mu, noise=noise, g=g, r=r) + write_conf(conf_file_local, w=args.w, b=args.b, d=args.d, mu=args.mu, + noise=args.noise, g=args.g, r=args.r) bind_path, conf_file = prepare_volume(str(conf_file_local), work_dir) volumes.append(bind_path) @@ -165,27 +192,27 @@ def run(edges=None, prizes=None, dummy_nodes=None, dummy_mode=None, mu_squared=N '--outlabel', 'oi1'] # add the dummy mode argument - if dummy_mode is not None and dummy_mode: + if args.dummy_mode is not None and args.dummy_mode: # for custom dummy modes, add the file - if dummy_mode == 'file': - command.extend(['--dummyMode', dummy_file]) + if args.dummy_mode == 'file': + command.extend(['--dummyMode', inputs["dummy_file"]]) # else pass in the dummy_mode and let oi1 handle it else: - command.extend(['--dummyMode', dummy_mode]) + command.extend(['--dummyMode', args.dummy_mode]) # Add optional arguments - if mu_squared is not None and mu_squared: + if args.mu_squared is not None and args.mu_squared: command.extend(['--musquared']) - if exclude_terms is not None and exclude_terms: + if args.exclude_terms is not None and args.exclude_terms: command.extend(['--excludeTerms']) - if noisy_edges is not None: - command.extend(['--noisyEdges', str(noisy_edges)]) - if shuffled_prizes is not None: - command.extend(['--shuffledPrizes', str(shuffled_prizes)]) - if random_terminals is not None: - command.extend(['--randomTerminals', str(random_terminals)]) - if seed is not None: - 
command.extend(['--seed', str(seed)]) + if args.noisy_edges is not None: + command.extend(['--noisyEdges', str(args.noisy_edges)]) + if args.shuffled_prizes is not None: + command.extend(['--shuffledPrizes', str(args.shuffled_prizes)]) + if args.random_terminals is not None: + command.extend(['--randomTerminals', str(args.random_terminals)]) + if args.seed is not None: + command.extend(['--seed', str(args.seed)]) container_suffix = "omics-integrator-1:no-conda" # no-conda version is the default run_container_and_log('Omics Integrator 1', diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 355d71bd6..42dc466cd 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -1,4 +1,6 @@ from pathlib import Path +from pydantic import BaseModel, ConfigDict +from typing import Optional import pandas as pd @@ -10,6 +12,36 @@ __all__ = ['OmicsIntegrator2'] +class OmicsIntegrator2Params(BaseModel): + w: float = 6 + "Omega: the weight of the edges connecting the dummy node to the nodes selected by dummyMode" + + b: float = 1 + "Beta: scaling factor of prizes" + + g: float = 20 + "Gamma: multiplicative edge penalty from degree of endpoints" + + noise: Optional[str] + "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations." + + noisy_edges: Optional[int] + "An integer specifying how many times to add noise to the given edge values and re-run." + + random_terminals: Optional[str] + "An integer specifying how many times to apply your given prizes to random nodes in the interactome and re-run" + + dummy_mode: Optional[str] + """ + Tells the program which nodes in the interactome to connect the dummy node to. (default: terminals) + "terminals" = connect to all terminals + "others" = connect to all nodes except for terminals + "all" = connect to all nodes in the interactome. + """ + + seed: Optional[str] + "The random seed to use for this run." + """ Omics Integrator 2 will construct a fully undirected graph from the provided input file - in the algorithm, it uses nx.Graph() objects, which are undirected @@ -20,11 +52,12 @@ - the expected raw input file should have node pairs in the 1st and 2nd columns, with a weight in the 3rd column - it can include repeated and bidirectional edges """ -class OmicsIntegrator2(PRM): +class OmicsIntegrator2(PRM[OmicsIntegrator2Params]): required_inputs = ['prizes', 'edges'] # OI2 does not have a specific paper. Instead, we link to the OI1 paper. dois = ["10.1371/journal.pcbi.1004879"] + @staticmethod def generate_inputs(data: Dataset, filename_map): """ Access fields from the dataset and write the required input files. @@ -69,8 +102,7 @@ def generate_inputs(data: Dataset, filename_map): # TODO add reasonable default values # TODO document required arguments @staticmethod - def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise=None, noisy_edges=None, - random_terminals=None, dummy_mode=None, seed=None, container_framework="docker"): + def run(inputs, args, output_file, container_framework="docker"): """ Run Omics Integrator 2 in the Docker image with the provided parameters. Only the .tsv output file is retained and then renamed. 
@@ -78,7 +110,7 @@ def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise
         @param output_file: the name of the output file, which will overwrite any existing file with this name
         @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional)
         """
-        if edges is None or prizes is None or output_file is None:
+        if inputs["edges"] is None or inputs["prizes"] is None:
             raise ValueError('Required Omics Integrator 2 arguments are missing')
 
         work_dir = '/spras'
@@ -86,10 +118,10 @@ def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise
         # Each volume is a tuple (src, dest)
         volumes = list()
 
-        bind_path, edge_file = prepare_volume(edges, work_dir)
+        bind_path, edge_file = prepare_volume(inputs["edges"], work_dir)
         volumes.append(bind_path)
 
-        bind_path, prize_file = prepare_volume(prizes, work_dir)
+        bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir)
         volumes.append(bind_path)
 
         out_dir = Path(output_file).parent
@@ -102,23 +134,23 @@ def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise
                    '-o', mapped_out_dir, '--filename', 'oi2']
 
         # Add optional arguments
-        if w is not None:
-            command.extend(['-w', str(w)])
-        if b is not None:
-            command.extend(['-b', str(b)])
-        if g is not None:
-            command.extend(['-g', str(g)])
-        if noise is not None:
-            command.extend(['-noise', str(noise)])
-        if noisy_edges is not None:
-            command.extend(['--noisy_edges', str(noisy_edges)])
-        if random_terminals is not None:
-            command.extend(['--random_terminals', str(random_terminals)])
-        if dummy_mode is not None:
+        if args.w is not None:
+            command.extend(['-w', str(args.w)])
+        if args.b is not None:
+            command.extend(['-b', str(args.b)])
+        if args.g is not None:
+            command.extend(['-g', str(args.g)])
+        if args.noise is not None:
+            command.extend(['-noise', str(args.noise)])
+        if args.noisy_edges is not None:
+            command.extend(['--noisy_edges', str(args.noisy_edges)])
+        if args.random_terminals is not None:
+            command.extend(['--random_terminals', str(args.random_terminals)])
+        if args.dummy_mode is not None:
             # This argument does not follow the other naming conventions
-            command.extend(['--dummyMode', str(dummy_mode)])
-        if seed is not None:
-            command.extend(['--seed', str(seed)])
+            command.extend(['--dummyMode', str(args.dummy_mode)])
+        if args.seed is not None:
+            command.extend(['--seed', str(args.seed)])
 
         container_suffix = "omics-integrator-2:v2"
         run_container_and_log('Omics Integrator 2',
diff --git a/spras/rwr.py b/spras/rwr.py
index 5c08d6777..12fc5d422 100644
--- a/spras/rwr.py
+++ b/spras/rwr.py
@@ -1,4 +1,6 @@
 from pathlib import Path
+from pydantic import BaseModel, ConfigDict
+from typing import Optional
 
 import pandas as pd
 
@@ -10,7 +12,13 @@
 
 __all__ = ['RWR']
 
-class RWR(PRM):
+class RWRParams(BaseModel):
+    threshold: Optional[int]
+    alpha: Optional[float]
+
+    model_config = ConfigDict(use_attribute_docstrings=True)
+
+class RWR(PRM[RWRParams]):
     required_inputs = ['network','nodes']
     dois = []
 
@@ -34,11 +42,11 @@ def generate_inputs(data, filename_map):
         edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False)
 
     @staticmethod
-    def run(network=None, nodes=None, alpha=None, output_file=None, container_framework="docker", threshold=None):
-        if not nodes:
+    def run(inputs, args, output_file, container_framework="docker"):
+        if not inputs["nodes"] or not inputs["network"]:
             raise ValueError('Required RWR arguments are missing')
 
-        with Path(network).open() as network_f:
+        with Path(inputs["network"]).open() as network_f:
             for line in network_f:
                 line = line.strip()
                 endpoints = line.split("|")
@@ -49,10 +57,10 @@ def run(network=None, nodes=None, alpha=None, output_file=None, container_framew
         # Each volume is a tuple (src, dest)
         volumes = list()
 
-        bind_path, nodes_file = prepare_volume(nodes, work_dir)
+        bind_path, nodes_file = prepare_volume(inputs["nodes"], work_dir)
         volumes.append(bind_path)
 
-        bind_path, network_file = prepare_volume(network, work_dir)
+        bind_path, network_file = prepare_volume(inputs["network"], work_dir)
         volumes.append(bind_path)
 
         # RWR does not provide an argument to set the output directory
@@ -70,8 +78,8 @@ def run(network=None, nodes=None, alpha=None, output_file=None, container_framew
                    '--output', mapped_out_prefix]
 
         # Add alpha as an optional argument
-        if alpha is not None:
-            command.extend(['--alpha', str(alpha)])
+        if args.alpha is not None:
+            command.extend(['--alpha', str(args.alpha)])
 
         container_suffix = 'rwr:v1'
         out = run_container(container_framework,
diff --git a/spras/strwr.py b/spras/strwr.py
index fc8536507..6693d7f5e 100644
--- a/spras/strwr.py
+++ b/spras/strwr.py
@@ -1,6 +1,7 @@
 from pathlib import Path
-
 import pandas as pd
+from pydantic import BaseModel, ConfigDict
+from typing import Optional
 
 from spras.containers import prepare_volume, run_container
 from spras.dataset import Dataset
@@ -10,8 +11,14 @@
 
 __all__ = ['ST_RWR']
 
+class ST_RWRParams(BaseModel):
+    threshold: Optional[int]
+    alpha: Optional[float]
+
+    model_config = ConfigDict(use_attribute_docstrings=True)
+
 # Note: This class is almost identical to the rwr.py file.
-class ST_RWR(PRM):
+class ST_RWR(PRM[ST_RWRParams]):
     required_inputs = ['network','sources','targets']
     dois = []
 
@@ -36,11 +43,11 @@ def generate_inputs(data, filename_map):
         edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False)
 
     @staticmethod
-    def run(network=None, sources=None, targets=None, alpha=None, output_file=None, container_framework="docker", threshold=None):
-        if not sources or not targets or not network or not output_file:
+    def run(inputs, args, output_file, container_framework="docker"):
+        if not inputs["sources"] or not inputs["targets"] or not inputs["network"] or not output_file:
             raise ValueError('Required local_neighborhood arguments are missing')
 
-        with Path(network).open() as network_f:
+        with Path(inputs["network"]).open() as network_f:
             for line in network_f:
                 line = line.strip()
                 endpoints = line.split("|")
@@ -52,13 +59,13 @@ def run(network=None, sources=None, targets=None, alpha=None, output_file=None,
         # Each volume is a tuple (src, dest)
         volumes = list()
 
-        bind_path, source_file = prepare_volume(sources, work_dir)
+        bind_path, source_file = prepare_volume(inputs["sources"], work_dir)
         volumes.append(bind_path)
 
-        bind_path, target_file = prepare_volume(targets, work_dir)
+        bind_path, target_file = prepare_volume(inputs["targets"], work_dir)
         volumes.append(bind_path)
 
-        bind_path, network_file = prepare_volume(network, work_dir)
+        bind_path, network_file = prepare_volume(inputs["network"], work_dir)
         volumes.append(bind_path)
 
         # ST_RWR does not provide an argument to set the output directory
@@ -77,8 +84,8 @@ def run(network=None, sources=None, targets=None, alpha=None, output_file=None,
                    '--output', mapped_out_prefix]
 
         # Add alpha as an optional argument
-        if alpha is not None:
-            command.extend(['--alpha', str(alpha)])
+        if args.alpha is not None:
+            command.extend(['--alpha', 
str(args.alpha)]) container_suffix = 'st-rwr:v1' out = run_container(container_framework, From 94b50c81fc1da7577c4f056336cb89411676da8e Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 12:27:46 -0700 Subject: [PATCH 37/60] refactor: meo, mcf, pl types --- spras/meo.py | 32 ++++++++++++++++++++++++-------- spras/mincostflow.py | 42 +++++++++++++++++++++--------------------- spras/pathlinker.py | 24 ++++++++++++++++-------- 3 files changed, 61 insertions(+), 37 deletions(-) diff --git a/spras/meo.py b/spras/meo.py index d4d79bf9f..06f041786 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -1,5 +1,7 @@ import os from pathlib import Path +from pydantic import BaseModel, ConfigDict +from typing import Optional from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( @@ -65,6 +67,21 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None, # Do not need csp.phase, csp.gen.file, or csp.sol.file because MAXCSP is not supported +class MEOParams(BaseModel): + max_path_length: Optional[str] + "the maximal length of a path from sources and targets to orient." + + local_search: Optional[str] + """ + a "Yes"/"No" parameter that enables MEO's local search functionality. + See "Improving approximations with local search" in the associated paper + for more information. + """ + + rand_restarts: Optional[int] + "The number of random restarts to do." + + model_config = ConfigDict(use_attribute_docstrings=True) """ MEO can support partially directed graphs @@ -82,7 +99,7 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None, """ -class MEO(PRM): +class MEO(PRM[MEOParams]): required_inputs = ['sources', 'targets', 'edges'] dois = ["10.1093/nar/gkq1207"] @@ -126,8 +143,7 @@ def generate_inputs(data, filename_map): # TODO add parameter validation # TODO document required arguments @staticmethod - def run(edges=None, sources=None, targets=None, output_file=None, max_path_length=None, local_search=None, - rand_restarts=None, container_framework="docker"): + def run(inputs, args, output_file=None, container_framework="docker"): """ Run Maximum Edge Orientation in the Docker image with the provided parameters. The properties file is generated from the provided arguments. 
@@ -138,7 +154,7 @@ def run(edges=None, sources=None, targets=None, output_file=None, max_path_lengt @param output_file: the name of the output edge file, which will overwrite any existing file with this name @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) """ - if edges is None or sources is None or targets is None or output_file is None: + if inputs["edges"] is None or inputs["sources"] is None or inputs["targets"] is None: raise ValueError('Required Maximum Edge Orientation arguments are missing') work_dir = '/spras' @@ -146,13 +162,13 @@ def run(edges=None, sources=None, targets=None, output_file=None, max_path_lengt # Each volume is a tuple (src, dest) volumes = list() - bind_path, edge_file = prepare_volume(edges, work_dir) + bind_path, edge_file = prepare_volume(inputs["edges"], work_dir) volumes.append(bind_path) - bind_path, source_file = prepare_volume(sources, work_dir) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir) volumes.append(bind_path) - bind_path, target_file = prepare_volume(targets, work_dir) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir) volumes.append(bind_path) out_dir = Path(output_file).parent @@ -171,7 +187,7 @@ def run(edges=None, sources=None, targets=None, output_file=None, max_path_lengt properties_file_local = Path(out_dir, properties_file) write_properties(filename=properties_file_local, edges=edge_file, sources=source_file, targets=target_file, edge_output=mapped_output_file, path_output=mapped_path_output, - max_path_length=max_path_length, local_search=local_search, rand_restarts=rand_restarts, framework=container_framework) + max_path_length=args.max_path_length, local_search=args.local_search, rand_restarts=args.rand_restarts, framework=container_framework) bind_path, properties_file = prepare_volume(str(properties_file_local), work_dir) volumes.append(bind_path) diff --git a/spras/mincostflow.py b/spras/mincostflow.py index d2d458b02..77f493f14 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -1,4 +1,6 @@ from pathlib import Path +from pydantic import BaseModel, ConfigDict +from typing import Optional from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( @@ -10,6 +12,15 @@ __all__ = ['MinCostFlow'] +class MinCostFlowParams(BaseModel): + flow: Optional[float] + "amount of flow going through the graph" + + capacity: Optional[float] + "amount of capacity allowed on each edge" + + model_config = ConfigDict(use_attribute_docstrings=True) + """ MinCostFlow deals with fully directed graphs - OR Tools MCF is designed for directed graphs @@ -22,7 +33,7 @@ - the expected raw input file should have node pairs in the 1st and 2nd columns, with the weight in the 3rd column - it can include repeated and bidirectional edges """ -class MinCostFlow (PRM): +class MinCostFlow(PRM[MinCostFlowParams]): required_inputs = ['sources', 'targets', 'edges'] dois = ["10.1038/s41540-020-00167-1"] @@ -60,20 +71,9 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(sources=None, targets=None, edges=None, output_file=None, flow=None, capacity=None, container_framework="docker"): - """ - Run min cost flow with Docker (or singularity) - @param sources: input sources (required) - @param targets: input targets (required) - @param edges: input network file (required) - @param output_file: output file name (required) - @param flow: amount of flow going through the graph (optional) 
-        @param capacity: amount of capacity allowed on each edge (optional)
-        @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional)
-        """
-
+    def run(inputs, args, output_file, container_framework="docker"):
         # ensures that these parameters are required
-        if not sources or not targets or not edges or not output_file:
+        if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]:
             raise ValueError('Required MinCostFlow arguments are missing')
 
         # the data files will be mapped within this directory within the container
@@ -82,13 +82,13 @@ def run(sources=None, targets=None, edges=None, output_file=None, flow=None, cap
         # the tuple is for mapping the sources, targets, edges, and output
         volumes = list()
 
-        bind_path, sources_file = prepare_volume(sources, work_dir)
+        bind_path, sources_file = prepare_volume(inputs["sources"], work_dir)
         volumes.append(bind_path)
 
-        bind_path, targets_file = prepare_volume(targets, work_dir)
+        bind_path, targets_file = prepare_volume(inputs["targets"], work_dir)
         volumes.append(bind_path)
 
-        bind_path, edges_file = prepare_volume(edges, work_dir)
+        bind_path, edges_file = prepare_volume(inputs["edges"], work_dir)
         volumes.append(bind_path)
 
         # Create a prefix for the output filename and ensure the directory exists
@@ -107,10 +107,10 @@
                    '--output', mapped_out_prefix]
 
         # Optional arguments (extend the command if available)
-        if flow is not None:
-            command.extend(['--flow', str(flow)])
-        if capacity is not None:
-            command.extend(['--capacity', str(capacity)])
+        if args.flow is not None:
+            command.extend(['--flow', str(args.flow)])
+        if args.capacity is not None:
+            command.extend(['--capacity', str(args.capacity)])
 
         # choosing to run in docker or singularity container
         container_suffix = "mincostflow"
diff --git a/spras/pathlinker.py b/spras/pathlinker.py
index d0504c489..8852b959a 100644
--- a/spras/pathlinker.py
+++ b/spras/pathlinker.py
@@ -1,5 +1,7 @@
 import warnings
 from pathlib import Path
+from pydantic import BaseModel, ConfigDict
+from typing import Optional
 
 from spras.containers import prepare_volume, run_container_and_log
 from spras.dataset import Dataset
@@ -12,6 +14,12 @@
 
 __all__ = ['PathLinker']
 
+class PathLinkerParams(BaseModel):
+    k: Optional[int]
+    "path length (optional)"
+
+    model_config = ConfigDict(use_attribute_docstrings=True)
+
 """
 Pathlinker will construct a fully directed graph from the provided input file
 - an edge is represented with a head and tail node, which represents the direction of the interation between two nodes
@@ -22,7 +30,7 @@
 - the expected raw input file should have node pairs in the 1st and 2nd columns, with a weight in the 3rd column
 - it can include repeated and bidirectional edges
 """
-class PathLinker(PRM):
+class PathLinker(PRM[PathLinkerParams]):
     required_inputs = ['nodetypes', 'network']
     dois = ["10.1038/npjsba.2016.2", "10.1089/cmb.2012.0274"]
 
@@ -68,20 +76,20 @@ def generate_inputs(data, filename_map):
 
     # Skips parameter validation step
     @staticmethod
-    def run(nodetypes=None, network=None, output_file=None, k=None, container_framework="docker"):
+    def run(inputs, args, output_file, container_framework="docker"):
         """
         Run PathLinker with Docker
         @param nodetypes: input node types with sources and targets (required)
         @param network: input network file (required)
         @param output_file: path to the output pathway file (required)
-        @param k: path length (optional)
+        @param k: path length (optional), provided via args
         @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional)
         """
         # Add additional parameter validation
         # Do not require k
         # Use the PathLinker default
         # Could consider setting the default here instead
-        if not nodetypes or not network or not output_file:
+        if not inputs["nodetypes"] or not inputs["network"]:
             raise ValueError('Required PathLinker arguments are missing')
 
         work_dir = '/spras'
@@ -89,10 +97,10 @@ def run(nodetypes=None, network=None, output_file=None, k=None, container_framew
         # Each volume is a tuple (src, dest)
         volumes = list()
 
-        bind_path, node_file = prepare_volume(nodetypes, work_dir)
+        bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir)
         volumes.append(bind_path)
 
-        bind_path, network_file = prepare_volume(network, work_dir)
+        bind_path, network_file = prepare_volume(inputs["network"], work_dir)
         volumes.append(bind_path)
 
         # PathLinker does not provide an argument to set the output directory
@@ -111,8 +119,8 @@ def run(nodetypes=None, network=None, output_file=None, k=None, container_framew
                    '--output', mapped_out_prefix]
 
         # Add optional argument
-        if k is not None:
-            command.extend(['-k', str(k)])
+        if args.k is not None:
+            command.extend(['-k', str(args.k)])
 
         container_suffix = "pathlinker:v2"
         run_container_and_log('PathLinker',

From 09fa1bac0301b54842f05d284ba3b0a2390e3d11 Mon Sep 17 00:00:00 2001
From: "Tristan F.-R."
Date: Mon, 14 Jul 2025 12:51:33 -0700
Subject: [PATCH 38/60] chore: begin slowly updating

---
 spras/allpairs.py              |   2 +-
 spras/btb.py                   |   2 +-
 spras/containers.py            |   4 +-
 spras/domino.py                |   8 +-
 spras/meo.py                   |  17 ++---
 spras/mincostflow.py           |   6 +-
 spras/omicsintegrator1.py      |   2 +-
 spras/omicsintegrator2.py      |   3 +-
 spras/pathlinker.py            |   3 +-
 spras/prm.py                   |   3 +-
 spras/rwr.py                   |   2 +-
 spras/strwr.py                 |   2 +-
 test/AllPairs/test_ap.py       |  65 +++++++---------
 test/BowTieBuilder/test_btb.py | 133 ++++++++++++++++-----------------
 test/DOMINO/test_domino.py     |  38 ++++------
 test/MEO/test_meo.py           |  30 ++++----
 16 files changed, 149 insertions(+), 171 deletions(-)

diff --git a/spras/allpairs.py b/spras/allpairs.py
index b1ffe2ee9..15a3b17f7 100644
--- a/spras/allpairs.py
+++ b/spras/allpairs.py
@@ -72,7 +72,7 @@ def generate_inputs(data: Dataset, filename_map):
                      header=["#Interactor1", "Interactor2", "Weight"])
 
     @staticmethod
-    def run(inputs, args, output_file, container_framework="docker"):
+    def run(inputs, output_file, args=Empty(), container_framework="docker"):
         """
         Run All Pairs Shortest Paths with Docker
         @param nodetypes: input node types with sources and targets (required)
diff --git a/spras/btb.py b/spras/btb.py
index a4098ee08..6ad3afb69 100644
--- a/spras/btb.py
+++ b/spras/btb.py
@@ -65,7 +65,7 @@ def generate_inputs(data, filename_map):
 
     # Skips parameter validation step
     @staticmethod
-    def run(inputs, args, output_file, container_framework="docker"):
+    def run(inputs, output_file, args=Empty(), container_framework="docker"):
 
         # Tests for pytest (docker container also runs this)
         # Testing out here avoids the trouble that container errors provide
diff --git a/spras/containers.py b/spras/containers.py
index 9a1568fdd..314d4bb45 100644
--- a/spras/containers.py
+++ b/spras/containers.py
@@ -369,7 +369,7 @@ def run_container_singularity(container: str, command: List[str], volumes: List[
 
 # Because this is called independently for each file, the same local path can be mounted to multiple volumes
-def prepare_volume(filename: Union[str, PurePath], volume_base: Union[str, PurePath]) -> Tuple[Tuple[PurePath, 
PurePath], str]: +def prepare_volume(filename: Union[str, os.PathLike], volume_base: Union[str, PurePath]) -> Tuple[Tuple[PurePath, PurePath], str]: """ Makes a file on the local file system accessible within a container by mapping the local (source) path to a new container (destination) path and renaming the file to be relative to the destination path. @@ -385,7 +385,7 @@ def prepare_volume(filename: Union[str, PurePath], volume_base: Union[str, PureP if not base_path.is_absolute(): raise ValueError(f'Volume base must be an absolute path: {volume_base}') - if isinstance(filename, PurePath): + if isinstance(filename, os.PathLike): filename = str(filename) filename_hash = hash_filename(filename, config.config.hash_length) diff --git a/spras/domino.py b/spras/domino.py index a70f1a1e3..187e53836 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -13,16 +13,16 @@ from spras.prm import PRM from spras.util import duplicate_edges -__all__ = ['DOMINO', 'pre_domino_id_transform', 'post_domino_id_transform'] +__all__ = ['DOMINO', 'DominoParams', 'pre_domino_id_transform', 'post_domino_id_transform'] ID_PREFIX = 'ENSG0' ID_PREFIX_LEN = len(ID_PREFIX) class DominoParams(BaseModel): - module_threshold: Optional[float] + module_threshold: Optional[float] = None "the p-value threshold for considering a slice as relevant (optional)" - slice_threshold: Optional[float] + slice_threshold: Optional[float] = None "the p-value threshold for considering a putative module as final module (optional)" model_config = ConfigDict(use_attribute_docstrings=True) @@ -76,7 +76,7 @@ def generate_inputs(data, filename_map): header=['ID_interactor_A', 'ppi', 'ID_interactor_B']) @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args=DominoParams(), container_framework="docker"): # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. if not inputs["network"] or not inputs["active_genes"]: raise ValueError('Required DOMINO arguments are missing') diff --git a/spras/meo.py b/spras/meo.py index 06f041786..0451cb4c0 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -11,7 +11,7 @@ from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['MEO', 'write_properties'] +__all__ = ['MEO', 'MEOParams', 'write_properties'] # replaces all underscores in the node names with unicode seperator # MEO keeps only the substring up to the first underscore when parsing node names @@ -58,7 +58,8 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None, if max_path_length is not None: f.write(f'max.path.length = {max_path_length}\n') if local_search is not None: - f.write(f'local.search = {local_search}\n') + # Yes/No for this parameter. + f.write(f'local.search = {"Yes" if local_search else "No"}\n') if rand_restarts is not None: f.write(f'rand.restarts = {rand_restarts}\n') @@ -68,17 +69,17 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None, # Do not need csp.phase, csp.gen.file, or csp.sol.file because MAXCSP is not supported class MEOParams(BaseModel): - max_path_length: Optional[str] + max_path_length: Optional[int] = None "the maximal length of a path from sources and targets to orient." - local_search: Optional[str] + local_search: Optional[bool] = None """ - a "Yes"/"No" parameter that enables MEO's local search functionality. + a boolean parameter that enables MEO's local search functionality. 
See "Improving approximations with local search" in the associated paper for more information. """ - rand_restarts: Optional[int] + rand_restarts: Optional[int] = None "The number of random restarts to do." model_config = ConfigDict(use_attribute_docstrings=True) @@ -143,7 +144,7 @@ def generate_inputs(data, filename_map): # TODO add parameter validation # TODO document required arguments @staticmethod - def run(inputs, args, output_file=None, container_framework="docker"): + def run(inputs, args=MEOParams(), output_file=None, container_framework="docker"): """ Run Maximum Edge Orientation in the Docker image with the provided parameters. The properties file is generated from the provided arguments. @@ -151,8 +152,6 @@ def run(inputs, args, output_file=None, container_framework="docker"): Does not support MINSAT or MAXCSP. Only the edge output file is retained. All other output files are deleted. - @param output_file: the name of the output edge file, which will overwrite any existing file with this name - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) """ if inputs["edges"] is None or inputs["sources"] is None or inputs["targets"] is None: raise ValueError('Required Maximum Edge Orientation arguments are missing') diff --git a/spras/mincostflow.py b/spras/mincostflow.py index 77f493f14..986c1c8eb 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -13,10 +13,10 @@ __all__ = ['MinCostFlow'] class MinCostFlowParams(BaseModel): - flow: Optional[float] + flow: Optional[float] = None "amount of flow going through the graph" - capacity: Optional[float] + capacity: Optional[float] = None "amount of capacity allowed on each edge" model_config = ConfigDict(use_attribute_docstrings=True) @@ -71,7 +71,7 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args=MinCostFlowParams(), container_framework="docker"): # ensures that these parameters are required if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: raise ValueError('Required MinCostFlow arguments are missing') diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 7a69a01d6..3361f5d2a 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -141,7 +141,7 @@ def generate_inputs(data, filename_map): # TODO add reasonable default values # TODO document required arguments @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args, container_framework="docker"): if inputs["edges"] is None or inputs["prizes"] is None or output_file is None or w is None or b is None or d is None: raise ValueError('Required Omics Integrator 1 arguments are missing') diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 42dc466cd..20351833e 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -98,11 +98,10 @@ def generate_inputs(data: Dataset, filename_map): edges_df.to_csv(filename_map['edges'], sep='\t', index=False, columns=['Interactor1', 'Interactor2', 'cost'], header=['protein1', 'protein2', 'cost']) - # TODO add parameter validation # TODO add reasonable default values # TODO document required arguments @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args, container_framework="docker"): """ Run Omics Integrator 2 in the Docker image with 
the provided parameters. Only the .tsv output file is retained and then renamed. diff --git a/spras/pathlinker.py b/spras/pathlinker.py index 8852b959a..3c78ffb84 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -74,9 +74,8 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map["network"],sep="\t",index=False,columns=["Interactor1","Interactor2","Weight"], header=["#Interactor1","Interactor2","Weight"]) - # Skips parameter validation step @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args, container_framework="docker"): """ Run PathLinker with Docker @param nodetypes: input node types with sources and targets (required) diff --git a/spras/prm.py b/spras/prm.py index 06d005b2a..1692f11f6 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from pydantic import BaseModel from typing import Any, cast, TypeVar, Generic +import os from spras.dataset import Dataset @@ -40,7 +41,7 @@ def generate_inputs(data: Dataset, filename_map: dict[str, str]): @staticmethod @abstractmethod - def run(inputs: dict[str, str], args: T, output_file: str, container_framework="docker"): + def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: T, container_framework="docker"): """ Runs an algorithm with the specified inputs, algorithm params (T), the designated output_file, and the desired container_framework. diff --git a/spras/rwr.py b/spras/rwr.py index 12fc5d422..12df71e01 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -42,7 +42,7 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args, container_framework="docker"): if not inputs["nodes"] or not inputs["network"]: raise ValueError('Required RWR arguments are missing') diff --git a/spras/strwr.py b/spras/strwr.py index 6693d7f5e..c603f9196 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -43,7 +43,7 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args, container_framework="docker"): if not inputs["sources"] or not inputs["targets"] or not inputs["network"] or not output_file: raise ValueError('Required local_neighborhood arguments are missing') diff --git a/test/AllPairs/test_ap.py b/test/AllPairs/test_ap.py index 8d094561f..ee76d0ce7 100644 --- a/test/AllPairs/test_ap.py +++ b/test/AllPairs/test_ap.py @@ -45,11 +45,10 @@ def test_allpairs(self): out_path = OUT_DIR.joinpath('sample-out.txt') out_path.unlink(missing_ok=True) # Only include required arguments - AllPairs.run( - nodetypes=str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'), - network=str(TEST_DIR / 'input' / 'sample-in-net.txt'), - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'), - output_file=str(out_path) + AllPairs.run({"nodetypes": str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'), + "network": str(TEST_DIR / 'input' / 'sample-in-net.txt'), + "directed_flag": str(TEST_DIR / 'input' / 'directed-flag-false.txt')}, + output_file=str(out_path) ) assert out_path.exists() @@ -57,9 +56,8 @@ def test_allpairs_missing(self): # Test the expected error is raised when required arguments are 
missing with pytest.raises(ValueError): # No nodetypes - AllPairs.run( - network=str(TEST_DIR / 'input' / 'sample-in-net.txt'), - output_file=str(OUT_DIR / 'sample-out.txt')) + AllPairs.run({"network": str(TEST_DIR / 'input' / 'sample-in-net.txt')}, + output_file=str(OUT_DIR / 'sample-out.txt')) # Only run Singularity test if the binary is available on the system # spython is only available on Unix, but do not explicitly skip non-Unix platforms @@ -68,12 +66,11 @@ def test_allpairs_singularity(self): out_path = OUT_DIR / 'sample-out.txt' out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - AllPairs.run( - nodetypes=str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'), - network=str(TEST_DIR / 'input' / 'sample-in-net.txt'), - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'), - output_file=str(out_path), - container_framework="singularity") + AllPairs.run({"nodetypes": str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'), + "network": str(TEST_DIR / 'input' / 'sample-in-net.txt'), + "directed_flag": str(TEST_DIR / 'input' / 'directed-flag-false.txt')}, + output_file=str(out_path), + container_framework="singularity") assert out_path.exists() @pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system') @@ -82,12 +79,11 @@ def test_allpairs_singularity_unpacked(self): out_path.unlink(missing_ok=True) # Indicate via config mechanism that we want to unpack the Singularity container config.config.unpack_singularity = True - AllPairs.run( - nodetypes=str(TEST_DIR / 'input/sample-in-nodetypes.txt'), - network=str(TEST_DIR / 'input/sample-in-net.txt'), - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'), - output_file=str(out_path), - container_framework="singularity") + AllPairs.run({"nodetypes": str(TEST_DIR / 'input/sample-in-nodetypes.txt'), + "network": str(TEST_DIR / 'input/sample-in-net.txt'), + "directed_flag": str(TEST_DIR / 'input' / 'directed-flag-false.txt')}, + output_file=str(out_path), + container_framework="singularity") config.config.unpack_singularity = False assert out_path.exists() @@ -104,12 +100,10 @@ def test_allpairs_correctness(self): out_path = OUT_DIR / 'correctness-out.txt' out_path.unlink(missing_ok=True) - AllPairs.run( - nodetypes=str(TEST_DIR / 'input' / 'correctness-nodetypes.txt'), - network=str(TEST_DIR / 'input' / 'correctness-network.txt'), - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'), - output_file=str(OUT_DIR / 'correctness-out.txt') - ) + AllPairs.run({"nodetypes": TEST_DIR / 'input' / 'correctness-nodetypes.txt', + "network": TEST_DIR / 'input' / 'correctness-network.txt', + "directed_flag": TEST_DIR / 'input' / 'directed-flag-false.txt'}, + output_file=OUT_DIR / 'correctness-out.txt') edge_equality_test_util(out_path, EXPECTED_DIR / 'correctness-expected.txt') @@ -117,12 +111,10 @@ def test_allpairs_directed(self): out_path = OUT_DIR / 'directed-out.txt' out_path.unlink(missing_ok=True) - AllPairs.run( - nodetypes=str(TEST_DIR / 'input' / 'directed-nodetypes.txt'), - network=str(TEST_DIR / 'input' / 'directed-network.txt'), - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-true.txt'), - output_file=str(OUT_DIR / 'directed-out.txt'), - ) + AllPairs.run({"nodetypes": TEST_DIR / 'input' / 'directed-nodetypes.txt', + "network": TEST_DIR / 'input' / 'directed-network.txt', + "directed_flag": TEST_DIR / 'input' / 'directed-flag-true.txt'}, + output_file=OUT_DIR / 'directed-out.txt') edge_equality_test_util(out_path, 
EXPECTED_DIR.joinpath('directed-expected.txt')) @@ -136,11 +128,10 @@ def test_allpairs_zero_length(self): out_path = OUT_DIR / 'zero-length-out.txt' out_path.unlink(missing_ok=True) - AllPairs.run( - nodetypes=TEST_DIR / 'input' / 'zero-length-nodetypes.txt', - network=TEST_DIR / 'input' / 'zero-length-network.txt', - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'), - output_file=OUT_DIR / 'zero-length-out.txt' + AllPairs.run({"nodetypes": TEST_DIR / 'input' / 'zero-length-nodetypes.txt', + "network": TEST_DIR / 'input' / 'zero-length-network.txt', + "directed_flag": TEST_DIR / 'input' / 'directed-flag-false.txt'}, + output_file=OUT_DIR / 'zero-length-out.txt' ) assert filecmp.cmp(OUT_DIR / 'zero-length-out.txt', EXPECTED_DIR / 'zero-length-expected.txt', shallow=False) diff --git a/test/BowTieBuilder/test_btb.py b/test/BowTieBuilder/test_btb.py index d4a458b3c..c65ce4a32 100644 --- a/test/BowTieBuilder/test_btb.py +++ b/test/BowTieBuilder/test_btb.py @@ -25,22 +25,19 @@ class TestBowTieBuilder: def test_btb_missing(self): with pytest.raises(ValueError): # No edges - BTB.run( - targets=Path(TEST_DIR, 'input', 'target.txt'), - sources=Path(TEST_DIR, 'input', 'source.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"targets": Path(TEST_DIR, 'input', 'target.txt'), + "sources": Path(TEST_DIR, 'input', 'source.txt')}, + output_file=OUT_FILE_DEFAULT) with pytest.raises(ValueError): # No source - BTB.run( - targets=Path(TEST_DIR, 'input', 'target.txt'), - edges=Path(TEST_DIR, 'input', 'edges.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"targets": Path(TEST_DIR, 'input', 'target.txt'), + "edges": Path(TEST_DIR, 'input', 'edges.txt')}, + output_file=OUT_FILE_DEFAULT) with pytest.raises(ValueError): # No target - BTB.run( - sources=Path(TEST_DIR, 'input', 'source.txt'), - edges=Path(TEST_DIR, 'input', 'edges.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"sources": Path(TEST_DIR, 'input', 'source.txt'), + "edges": Path(TEST_DIR, 'input', 'edges.txt')}, + output_file=OUT_FILE_DEFAULT) """ @@ -48,30 +45,30 @@ def test_btb_missing(self): """ def test_btb_file(self): with pytest.raises(ValueError): - BTB.run(sources=Path(TEST_DIR, 'input', 'unknown.txt'), - targets=Path(TEST_DIR, 'input', 'target.txt'), - edges=Path(TEST_DIR, 'input', 'edges.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"sources": Path(TEST_DIR, 'input', 'unknown.txt'), + "targets": Path(TEST_DIR, 'input', 'target.txt'), + "edges": Path(TEST_DIR, 'input', 'edges.txt')}, + output_file=OUT_FILE_DEFAULT) """ Run the BowTieBuilder algorithm with bad input data """ def test_format_error(self): with pytest.raises(IndexError): - BTB.run(sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - edges=Path(TEST_DIR, 'input', 'bad-edges.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt'), + "edges": Path(TEST_DIR, 'input', 'bad-edges.txt')}, + output_file=OUT_FILE_DEFAULT) """ Run the BowTieBuilder algorithm on the example input files and check the output matches the expected output """ def test_btb(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'btb-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'btb-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), 
+ "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'btb-output.txt') @@ -89,10 +86,10 @@ def test_btb(self): """ def test_disjoint(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'disjoint-edges.txt'), - sources=Path(TEST_DIR, 'input', 'disjoint-sources.txt'), - targets=Path(TEST_DIR, 'input', 'disjoint-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'disjoint-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'disjoint-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'disjoint-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'disjoint-output.txt') @@ -110,10 +107,10 @@ def test_disjoint(self): """ def test_disjoint2(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'disjoint2-edges.txt'), - sources=Path(TEST_DIR, 'input', 'disjoint-sources.txt'), - targets=Path(TEST_DIR, 'input', 'disjoint-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'disjoint2-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'disjoint-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'disjoint-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'disjoint-output.txt') @@ -132,10 +129,10 @@ def test_disjoint2(self): def test_missing_file(self): with pytest.raises(ValueError): with pytest.raises(OSError): - BTB.run(edges=Path(TEST_DIR, 'input', 'missing.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'missing.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) """ @@ -143,10 +140,10 @@ def test_missing_file(self): """ def test_source_to_source(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'source-to-source-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'source-to-source-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'source-to-source-output.txt') @@ -164,10 +161,10 @@ def test_source_to_source(self): """ def test_source_to_source2(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'source-to-source2-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'source-to-source2-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'source-to-source2-output.txt') @@ -186,10 +183,10 @@ 
def test_source_to_source2(self): def test_source_to_source_disjoint(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'source-to-source-disjoint-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'source-to-source-disjoint-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'source-to-source-disjoint-output.txt') @@ -208,10 +205,10 @@ def test_source_to_source_disjoint(self): def test_bidirectional(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'bidirectional-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'bidirectional-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'bidirectional-output.txt') @@ -230,10 +227,10 @@ def test_bidirectional(self): def test_target_to_source(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'target-to-source-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'target-to-source-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'empty-output.txt') @@ -252,10 +249,10 @@ def test_target_to_source(self): def test_loop(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'loop-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'loop-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'loop-output.txt') @@ -274,10 +271,10 @@ def test_loop(self): def test_weighted(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'weighted-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'weighted-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'weighted-output.txt') @@ -292,10 +289,10 @@ def test_weighted(self): def test_weight_one(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 
'input', 'weight-one-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'weight-one-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'weighted-output.txt') diff --git a/test/DOMINO/test_domino.py b/test/DOMINO/test_domino.py index 4323ea4c9..62563bdc3 100644 --- a/test/DOMINO/test_domino.py +++ b/test/DOMINO/test_domino.py @@ -5,7 +5,7 @@ import pytest import spras.config.config as config -from spras.domino import DOMINO, post_domino_id_transform, pre_domino_id_transform +from spras.domino import DOMINO, DominoParams, post_domino_id_transform, pre_domino_id_transform config.init_from_file("config/config.yaml") @@ -28,10 +28,9 @@ def test_domino_required(self): # Only include required arguments out_path = Path(OUT_FILE_DEFAULT) out_path.unlink(missing_ok=True) - DOMINO.run( - network=TEST_DIR+'input/domino-network.txt', - active_genes=TEST_DIR+'input/domino-active-genes.txt', - output_file=OUT_FILE_DEFAULT) + DOMINO.run({"network": TEST_DIR+'input/domino-network.txt', + "active_genes": TEST_DIR+'input/domino-active-genes.txt'}, + output_file=OUT_FILE_DEFAULT) # output_file should be empty assert out_path.exists() @@ -39,12 +38,10 @@ def test_domino_optional(self): # Include optional arguments out_path = Path(OUT_FILE_OPTIONAL) out_path.unlink(missing_ok=True) - DOMINO.run( - network=TEST_DIR+'input/domino-network.txt', - active_genes=TEST_DIR+'input/domino-active-genes.txt', - output_file=OUT_FILE_OPTIONAL, - slice_threshold=0.4, - module_threshold=0.06) + DOMINO.run({"network": TEST_DIR+'input/domino-network.txt', + "active_genes": TEST_DIR+'input/domino-active-genes.txt'}, + output_file=OUT_FILE_OPTIONAL, + args=DominoParams(slice_threshold=0.4, module_threshold=0.06)) # output_file should be empty assert out_path.exists() @@ -52,17 +49,15 @@ def test_domino_missing_active_genes(self): # Test the expected error is raised when active_genes argument is missing with pytest.raises(ValueError): # No active_genes - DOMINO.run( - network=TEST_DIR+'input/domino-network.txt', - output_file=OUT_FILE_DEFAULT) + DOMINO.run({"network": TEST_DIR+'input/domino-network.txt'}, + output_file=OUT_FILE_DEFAULT) def test_domino_missing_network(self): # Test the expected error is raised when network argument is missing with pytest.raises(ValueError): # No network - DOMINO.run( - active_genes=TEST_DIR+'input/domino-active-genes.txt', - output_file=OUT_FILE_DEFAULT) + DOMINO.run({"active_genes": TEST_DIR+'input/domino-active-genes.txt'}, + output_file=OUT_FILE_DEFAULT) # Only run Singularity test if the binary is available on the system # spython is only available on Unix, but do not explicitly skip non-Unix platforms @@ -71,11 +66,10 @@ def test_domino_singularity(self): out_path = Path(OUT_FILE_DEFAULT) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - DOMINO.run( - network=TEST_DIR+'input/domino-network.txt', - active_genes=TEST_DIR+'input/domino-active-genes.txt', - output_file=OUT_FILE_DEFAULT, - container_framework="singularity") + DOMINO.run({"network": TEST_DIR+'input/domino-network.txt', + "active_genes": TEST_DIR+'input/domino-active-genes.txt'}, + output_file=OUT_FILE_DEFAULT, + 
container_framework="singularity") assert out_path.exists() def test_pre_id_transform(self): diff --git a/test/MEO/test_meo.py b/test/MEO/test_meo.py index 32958be20..051744ed7 100644 --- a/test/MEO/test_meo.py +++ b/test/MEO/test_meo.py @@ -4,7 +4,7 @@ import pytest import spras.config.config as config -from spras.meo import MEO, write_properties +from spras.meo import MEO, MEOParams, write_properties config.init_from_file("config/config.yaml") @@ -20,9 +20,9 @@ def test_meo_required(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Only include required arguments - MEO.run(edges=TEST_DIR + 'input/meo-edges.txt', - sources=TEST_DIR + 'input/meo-sources.txt', - targets=TEST_DIR + 'input/meo-targets.txt', + MEO.run({"edges": TEST_DIR + 'input/meo-edges.txt', + "sources": TEST_DIR + 'input/meo-sources.txt', + "targets": TEST_DIR + 'input/meo-targets.txt'}, output_file=OUT_FILE) assert out_path.exists() @@ -30,21 +30,19 @@ def test_meo_all_optional(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include all optional arguments - MEO.run(edges=TEST_DIR + 'input/meo-edges.txt', - sources=TEST_DIR + 'input/meo-sources.txt', - targets=TEST_DIR + 'input/meo-targets.txt', - output_file=OUT_FILE, - max_path_length=3, - local_search='No', - rand_restarts=10) + MEO.run({"edges": TEST_DIR + 'input/meo-edges.txt', + "sources": TEST_DIR + 'input/meo-sources.txt', + "targets": TEST_DIR + 'input/meo-targets.txt'}, + args=MEOParams(max_path_length=3, local_search=False, rand_restarts=10), + output_file=OUT_FILE) assert out_path.exists() def test_meo_missing(self): # Test the expected error is raised when required arguments are missing with pytest.raises(ValueError): # No edges - MEO.run(sources=TEST_DIR + 'input/meo-sources.txt', - targets=TEST_DIR + 'input/meo-targets.txt', + MEO.run({"sources": TEST_DIR + 'input/meo-sources.txt', + "targets": TEST_DIR + 'input/meo-targets.txt'}, output_file=OUT_FILE) with pytest.raises(ValueError): @@ -62,9 +60,9 @@ def test_meo_singularity(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - MEO.run(edges=TEST_DIR + 'input/meo-edges.txt', - sources=TEST_DIR + 'input/meo-sources.txt', - targets=TEST_DIR + 'input/meo-targets.txt', + MEO.run({"edges": TEST_DIR + 'input/meo-edges.txt', + "sources": TEST_DIR + 'input/meo-sources.txt', + "targets": TEST_DIR + 'input/meo-targets.txt'}, output_file=OUT_FILE, container_framework="singularity") assert out_path.exists() From 32d4b5cbce1e46a0afe5744b778350ab2b7cbae8 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Mon, 14 Jul 2025 20:03:27 +0000 Subject: [PATCH 39/60] refactor: moving more tests --- spras/omicsintegrator1.py | 34 +++++----- spras/omicsintegrator2.py | 16 +++-- test/OmicsIntegrator1/test_oi1.py | 109 +++++++++++++++--------------- test/OmicsIntegrator2/test_oi2.py | 43 +++++------- 4 files changed, 97 insertions(+), 105 deletions(-) diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 3361f5d2a..45465ecd6 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -7,7 +7,7 @@ from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['OmicsIntegrator1', 'write_conf'] +__all__ = ['OmicsIntegrator1', 'OmicsIntegrator1Params', 'write_conf'] # TODO decide on default number of processes and threads @@ -38,41 +38,41 @@ def write_conf(filename=Path('config.txt'), w=None, b=None, d=None, mu=None, noi f.write('threads = 1\n') class OmicsIntegrator1Params(BaseModel): - dummy_mode: Optional[str] - mu_squared: Optional[str] - exclude_terms: Optional[str] + dummy_mode: Optional[str] = None + mu_squared: Optional[bool] = None + exclude_terms: Optional[bool] = None - noisy_edges: Optional[str] + noisy_edges: Optional[int] = None "How many times you would like to add noise to the given edge values and re-run the algorithm." - shuffled_prizes: Optional[int] + shuffled_prizes: Optional[int] = None "shuffled_prizes: How many times the algorithm should shuffle the prizes and re-run" - random_terminals: Optional[int] + random_terminals: Optional[int] = None "How many times to apply the given prizes to random nodes in the interactome" - seed: Optional[str] + seed: Optional[int] = None "the randomness seed to use" - w: Optional[float] + w: int "the number of trees" - b: Optional[str] + b: float "the trade-off between including more terminals and using less reliable edges" - d: Optional[str] + d: int "controls the maximum path-length from v0 to terminal nodes" - mu: Optional[float] + mu: Optional[float] = None "controls the degree-based negative prizes (defualt 0.0)" - noise: Optional[str] + noise: Optional[float] = None "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations" - g: Optional[str] + g: Optional[float] = None "(Gamma) multiplicative edge penalty from degree of endpoints" - r: Optional[str] + r: Optional[float] = None "msgsteiner parameter that adds random noise to edges, which is rarely needed because the Forest --noisyEdges option is recommended instead (default 0)" model_config = ConfigDict(use_attribute_docstrings=True) @@ -142,7 +142,7 @@ def generate_inputs(data, filename_map): # TODO document required arguments @staticmethod def run(inputs, output_file, args, container_framework="docker"): - if inputs["edges"] is None or inputs["prizes"] is None or output_file is None or w is None or b is None or d is None: + if inputs["edges"] is None or inputs["prizes"] is None or output_file is None: raise ValueError('Required Omics Integrator 1 arguments are missing') work_dir = '/spras' @@ -195,7 +195,7 @@ def run(inputs, output_file, args, container_framework="docker"): if args.dummy_mode is not None and args.dummy_mode: # for custom dummy modes, add the file if args.dummy_mode == 'file': - command.extend(['--dummyMode', inputs["dummy_file"]]) + command.extend(['--dummyMode', str(inputs["dummy_file"])]) # else pass in the dummy_mode and let oi1 handle it else: command.extend(['--dummyMode', args.dummy_mode]) diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py 
index 20351833e..944bf1bf7 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -10,7 +10,7 @@ from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges -__all__ = ['OmicsIntegrator2'] +__all__ = ['OmicsIntegrator2', 'OmicsIntegrator2Params'] class OmicsIntegrator2Params(BaseModel): w: float = 6 @@ -22,16 +22,16 @@ class OmicsIntegrator2Params(BaseModel): g: float = 20 "Gamma: multiplicative edge penalty from degree of endpoints" - noise: Optional[str] + noise: Optional[float] = None "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations." - noisy_edges: Optional[int] + noisy_edges: Optional[int] = None "An integer specifying how many times to add noise to the given edge values and re-run." - random_terminals: Optional[str] + random_terminals: Optional[int] = None "An integer specifying how many times to apply your given prizes to random nodes in the interactome and re-run" - dummy_mode: Optional[str] + dummy_mode: Optional[str] = None """ Tells the program which nodes in the interactome to connect the dummy node to. (default: terminals) "terminals" = connect to all terminals @@ -39,9 +39,11 @@ class OmicsIntegrator2Params(BaseModel): "all" = connect to all nodes in the interactome. """ - seed: Optional[str] + seed: Optional[int] = None "The random seed to use for this run." + model_config = ConfigDict(use_attribute_docstrings=True) + """ Omics Integrator 2 will construct a fully undirected graph from the provided input file - in the algorithm, it uses nx.Graph() objects, which are undirected @@ -101,7 +103,7 @@ def generate_inputs(data: Dataset, filename_map): # TODO add reasonable default values # TODO document required arguments @staticmethod - def run(inputs, output_file, args, container_framework="docker"): + def run(inputs, output_file, args=OmicsIntegrator2Params(), container_framework="docker"): """ Run Omics Integrator 2 in the Docker image with the provided parameters. Only the .tsv output file is retained and then renamed. 
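The upshot of these signature changes, sketched below with placeholder paths: required input files now arrive in a single dict keyed by the wrapper's `required_inputs` names, and tunable parameters arrive in a typed pydantic model rather than as loose keyword arguments.

    from pathlib import Path

    from spras.omicsintegrator2 import OmicsIntegrator2, OmicsIntegrator2Params

    # Input keys mirror the wrapper's required_inputs; the paths here are placeholders.
    # Omitting args falls back to the model's documented defaults.
    OmicsIntegrator2.run({"edges": Path("input/oi2-edges.txt"),
                          "prizes": Path("input/oi2-prizes.txt")},
                         output_file=Path("output/pathway.tsv"),
                         args=OmicsIntegrator2Params(w=5, b=1, g=3))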
diff --git a/test/OmicsIntegrator1/test_oi1.py b/test/OmicsIntegrator1/test_oi1.py index a484c0af3..fad4627e0 100644 --- a/test/OmicsIntegrator1/test_oi1.py +++ b/test/OmicsIntegrator1/test_oi1.py @@ -4,7 +4,7 @@ import pytest import spras.config.config as config -from spras.omicsintegrator1 import OmicsIntegrator1, write_conf +from spras.omicsintegrator1 import OmicsIntegrator1, OmicsIntegrator1Params, write_conf config.init_from_file("config/config.yaml") @@ -20,79 +20,74 @@ def test_oi1_required(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Only include required arguments - OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt', - prizes=TEST_DIR+'input/oi1-prizes.txt', + OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt', + "prizes": TEST_DIR+'input/oi1-prizes.txt'}, output_file=OUT_FILE, - w=5, - b=1, - d=10) + args=OmicsIntegrator1Params(w=5, b=1, d=10)) assert out_path.exists() def test_oi1_some_optional(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include optional argument - OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt', - prizes=TEST_DIR+'input/oi1-prizes.txt', + OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt', + "prizes": TEST_DIR+'input/oi1-prizes.txt'}, output_file=OUT_FILE, - w=5, - b=1, - d=10, - noise=0.333, - g=0.001, - r=0) + args=OmicsIntegrator1Params(w=5, b=1, d=10, noise=0.333, g=0.001, r=0)) assert out_path.exists() def test_oi1_all_optional(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include all optional arguments - OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt', - prizes=TEST_DIR+'input/oi1-prizes.txt', - dummy_nodes=None, - dummy_mode='terminals', - mu_squared=True, - exclude_terms=True, + OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt', + "prizes": TEST_DIR+'input/oi1-prizes.txt'}, output_file=OUT_FILE, - noisy_edges=0, - shuffled_prizes=0, - random_terminals=0, - seed=1, - w=5, - b=1, - d=10, - mu=0, - noise=0.333, - g=0.001, - r=0) + args=OmicsIntegrator1Params( + dummy_mode='terminals', + mu_squared=True, + exclude_terms=True, + noisy_edges=0, + shuffled_prizes=0, + random_terminals=0, + seed=1, + w=5, + b=1, + d=10, + mu=0, + noise=0.333, + g=0.001, + r=0)) assert out_path.exists() def test_oi1_dummy_file(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include optional argument - OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt', - prizes=TEST_DIR+'input/oi1-prizes.txt', - dummy_nodes=TEST_DIR + 'input/oi1-dummy.txt', - dummy_mode='file', + OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt', + "prizes": TEST_DIR+'input/oi1-prizes.txt', + "dummy_nodes": TEST_DIR + 'input/oi1-dummy.txt'}, output_file=OUT_FILE, - w=5, - b=1, - d=10, - noise=0.333, - g=0.001, - r=0) + args=OmicsIntegrator1Params( + dummy_mode='file', + w=5, + b=1, + d=10, + noise=0.333, + g=0.001, + r=0)) assert out_path.exists() def test_oi1_missing(self): # Test the expected error is raised when required arguments are missing with pytest.raises(ValueError): # No edges - OmicsIntegrator1.run(prizes=TEST_DIR + 'input/oi1-prizes.txt', + OmicsIntegrator1.run({"prizes": TEST_DIR + 'input/oi1-prizes.txt'}, output_file=TEST_DIR+'output/test_optimalForest.sif', - w=5, - b=1, - d=10) + args=OmicsIntegrator1Params( + w=5, + b=1, + d=10)) with pytest.raises(ValueError): # No w write_conf(Path('.'), @@ -103,13 +98,14 @@ def test_oi1_missing_dummy(self): # Test the expected error is raised when the dummy_nodes file is missing and the 
dummy_mode is 'file' with pytest.raises(ValueError): # No edges - OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt', - prizes=TEST_DIR + 'input/oi1-prizes.txt', + OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt', + "prizes": TEST_DIR + 'input/oi1-prizes.txt'}, output_file=TEST_DIR+'output/test_optimalForest.sif', - w=5, - b=1, - d=10, - dummy_mode='file') + args=OmicsIntegrator1Params( + w=5, + b=1, + d=10, + dummy_mode='file')) # Only run Singularity test if the binary is available on the system # spython is only available on Unix, but do not explicitly skip non-Unix platforms @@ -118,11 +114,12 @@ def test_oi1_singularity(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - OmicsIntegrator1.run(edges=TEST_DIR + 'input/oi1-edges.txt', - prizes=TEST_DIR + 'input/oi1-prizes.txt', + OmicsIntegrator1.run({"edges": TEST_DIR + 'input/oi1-edges.txt', + "prizes": TEST_DIR + 'input/oi1-prizes.txt'}, output_file=OUT_FILE, - w=5, - b=1, - d=10, + args=OmicsIntegrator1Params( + w=5, + b=1, + d=10), container_framework="singularity") assert out_path.exists() diff --git a/test/OmicsIntegrator2/test_oi2.py b/test/OmicsIntegrator2/test_oi2.py index 13f7f30b6..0239d5e5f 100644 --- a/test/OmicsIntegrator2/test_oi2.py +++ b/test/OmicsIntegrator2/test_oi2.py @@ -4,7 +4,7 @@ import pytest import spras.config.config as config -from spras.omicsintegrator2 import OmicsIntegrator2 +from spras.omicsintegrator2 import OmicsIntegrator2, OmicsIntegrator2Params config.init_from_file("config/config.yaml") @@ -21,51 +21,44 @@ class TestOmicsIntegrator2: def test_oi2_required(self): # Only include required arguments OUT_FILE.unlink(missing_ok=True) - OmicsIntegrator2.run(edges=EDGE_FILE, - prizes=PRIZE_FILE, + OmicsIntegrator2.run({"edges": EDGE_FILE, + "prizes": PRIZE_FILE}, output_file=OUT_FILE) assert OUT_FILE.exists() def test_oi2_some_optional(self): # Include optional argument OUT_FILE.unlink(missing_ok=True) - OmicsIntegrator2.run(edges=EDGE_FILE, - prizes=PRIZE_FILE, + OmicsIntegrator2.run({"edges": EDGE_FILE, + "prizes": PRIZE_FILE}, output_file=OUT_FILE, - g=0) + args=OmicsIntegrator2Params(g=0)) assert OUT_FILE.exists() def test_oi2_all_optional(self): # Include all optional arguments OUT_FILE.unlink(missing_ok=True) - OmicsIntegrator2.run(edges=EDGE_FILE, - prizes=PRIZE_FILE, + OmicsIntegrator2.run({"edges": EDGE_FILE, + "prizes": PRIZE_FILE}, output_file=OUT_FILE, - w=5, - b=1, - g=3, - noise=0.1, - noisy_edges=0, - random_terminals=0, - dummy_mode='terminals', - seed=2) + args=OmicsIntegrator2Params(w=5, + b=1, + g=3, + noise=0.1, + noisy_edges=0, + random_terminals=0, + dummy_mode='terminals', + seed=2)) assert OUT_FILE.exists() - def test_oi2_missing(self): - # Test the expected error is raised when required arguments are missing - with pytest.raises(ValueError): - # No output_file - OmicsIntegrator2.run(edges=EDGE_FILE, - prizes=PRIZE_FILE) - # Only run Singularity test if the binary is available on the system # spython is only available on Unix, but do not explicitly skip non-Unix platforms @pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system') def test_oi2_singularity(self): # Only include required arguments OUT_FILE.unlink(missing_ok=True) - OmicsIntegrator2.run(edges=EDGE_FILE, - prizes=PRIZE_FILE, + OmicsIntegrator2.run({"edges": EDGE_FILE, + "prizes": PRIZE_FILE}, output_file=OUT_FILE, container_framework="singularity") assert OUT_FILE.exists() From 
9b539e99fe7f4f53549e4e295fe0d3bf6bce39ed Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 20:28:50 +0000 Subject: [PATCH 40/60] fix: correct params --- spras/config/util.py | 3 +- spras/domino.py | 2 +- spras/meo.py | 5 ++- spras/mincostflow.py | 5 ++- spras/omicsintegrator1.py | 3 +- spras/omicsintegrator2.py | 2 +- spras/pathlinker.py | 9 +++-- spras/prm.py | 5 ++- spras/rwr.py | 11 +++-- spras/strwr.py | 12 ++++-- test/DOMINO/test_domino.py | 7 +++- test/MinCostFlow/test_mcf.py | 65 ++++++++++++++---------------- test/OmicsIntegrator2/test_oi2.py | 9 ++--- test/PathLinker/test_pathlinker.py | 36 +++++++---------- test/RWR/test_RWR.py | 26 ++++++------ test/ST_RWR/test_STRWR.py | 34 ++++++++-------- 16 files changed, 120 insertions(+), 114 deletions(-) diff --git a/spras/config/util.py b/spras/config/util.py index c23374a50..32f19076f 100644 --- a/spras/config/util.py +++ b/spras/config/util.py @@ -1,7 +1,8 @@ from enum import Enum -from pydantic import BaseModel, ConfigDict from typing import Any +from pydantic import BaseModel, ConfigDict + # https://stackoverflow.com/a/76883868/7589775 class CaseInsensitiveEnum(str, Enum): diff --git a/spras/domino.py b/spras/domino.py index 187e53836..86f3c0563 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -1,9 +1,9 @@ import json from pathlib import Path +from typing import Optional import pandas as pd from pydantic import BaseModel, ConfigDict -from typing import Optional from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( diff --git a/spras/meo.py b/spras/meo.py index 0451cb4c0..30e81d87e 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -1,8 +1,9 @@ import os from pathlib import Path -from pydantic import BaseModel, ConfigDict from typing import Optional +from pydantic import BaseModel, ConfigDict + from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( add_directionality_constant, @@ -78,7 +79,7 @@ class MEOParams(BaseModel): See "Improving approximations with local search" in the associated paper for more information. """ - + rand_restarts: Optional[int] = None "The number of random restarts to do." 
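The bare string literal under each field is not just a comment: because these params models set `model_config = ConfigDict(use_attribute_docstrings=True)`, pydantic lifts the attribute docstring into the field's description. A minimal sketch of the pattern (the class name is illustrative):

    from typing import Optional

    from pydantic import BaseModel, ConfigDict

    class ExampleParams(BaseModel):
        rand_restarts: Optional[int] = None
        "The number of random restarts to do."

        model_config = ConfigDict(use_attribute_docstrings=True)

    # The attribute docstring surfaces as the field description,
    # e.g. in model_json_schema() output.
    print(ExampleParams.model_fields["rand_restarts"].description)
    # -> The number of random restarts to do.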
diff --git a/spras/mincostflow.py b/spras/mincostflow.py index 986c1c8eb..eab80c631 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -1,7 +1,8 @@ from pathlib import Path -from pydantic import BaseModel, ConfigDict from typing import Optional +from pydantic import BaseModel, ConfigDict + from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( convert_undirected_to_directed, @@ -10,7 +11,7 @@ from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['MinCostFlow'] +__all__ = ['MinCostFlow', 'MinCostFlowParams'] class MinCostFlowParams(BaseModel): flow: Optional[float] = None diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 45465ecd6..d8226f735 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -1,7 +1,8 @@ from pathlib import Path -from pydantic import BaseModel, ConfigDict from typing import Optional +from pydantic import BaseModel, ConfigDict + from spras.containers import prepare_volume, run_container_and_log from spras.interactome import reinsert_direction_col_mixed from spras.prm import PRM diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 944bf1bf7..41aec9ee1 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -1,8 +1,8 @@ from pathlib import Path -from pydantic import BaseModel, ConfigDict from typing import Optional import pandas as pd +from pydantic import BaseModel, ConfigDict from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset diff --git a/spras/pathlinker.py b/spras/pathlinker.py index 3c78ffb84..167403cef 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -1,8 +1,9 @@ import warnings from pathlib import Path -from pydantic import BaseModel, ConfigDict from typing import Optional +from pydantic import BaseModel, ConfigDict + from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import ( @@ -12,10 +13,10 @@ from spras.prm import PRM from spras.util import duplicate_edges, raw_pathway_df -__all__ = ['PathLinker'] +__all__ = ['PathLinker', 'PathLinkerParams'] class PathLinkerParams(BaseModel): - k: Optional[int] + k: Optional[int] = None "path length (optional)" model_config = ConfigDict(use_attribute_docstrings=True) @@ -75,7 +76,7 @@ def generate_inputs(data, filename_map): header=["#Interactor1","Interactor2","Weight"]) @staticmethod - def run(inputs, output_file, args, container_framework="docker"): + def run(inputs, output_file, args=PathLinkerParams(), container_framework="docker"): """ Run PathLinker with Docker @param nodetypes: input node types with sources and targets (required) diff --git a/spras/prm.py b/spras/prm.py index 1692f11f6..73c94454a 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -1,7 +1,8 @@ +import os from abc import ABC, abstractmethod +from typing import Any, Generic, TypeVar, cast + from pydantic import BaseModel -from typing import Any, cast, TypeVar, Generic -import os from spras.dataset import Dataset diff --git a/spras/rwr.py b/spras/rwr.py index 12df71e01..ba78589ec 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -1,8 +1,8 @@ from pathlib import Path -from pydantic import BaseModel, ConfigDict from typing import Optional import pandas as pd +from pydantic import BaseModel, ConfigDict from spras.containers import prepare_volume, run_container from spras.dataset import Dataset @@ -10,11 +10,14 @@ from spras.prm import PRM 
from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['RWR'] +__all__ = ['RWR', 'RWRParams'] class RWRParams(BaseModel): - threshold: Optional[int] - alpha: Optional[float] + threshold: int + "The number of nodes to return" + + alpha: Optional[float] = None + "The chance of a restart during the random walk" model_config = ConfigDict(use_attribute_docstrings=True) diff --git a/spras/strwr.py b/spras/strwr.py index c603f9196..37590e7c6 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -1,7 +1,8 @@ from pathlib import Path +from typing import Optional + import pandas as pd from pydantic import BaseModel, ConfigDict -from typing import Optional from spras.containers import prepare_volume, run_container from spras.dataset import Dataset @@ -9,11 +10,14 @@ from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['ST_RWR'] +__all__ = ['ST_RWR', 'ST_RWRParams'] class ST_RWRParams(BaseModel): - threshold: Optional[int] - alpha: Optional[float] + threshold: int + "The number of nodes to return" + + alpha: Optional[float] = None + "The chance of a restart during the random walk" model_config = ConfigDict(use_attribute_docstrings=True) diff --git a/test/DOMINO/test_domino.py b/test/DOMINO/test_domino.py index 62563bdc3..e84c0df8b 100644 --- a/test/DOMINO/test_domino.py +++ b/test/DOMINO/test_domino.py @@ -5,7 +5,12 @@ import pytest import spras.config.config as config -from spras.domino import DOMINO, DominoParams, post_domino_id_transform, pre_domino_id_transform +from spras.domino import ( + DOMINO, + DominoParams, + post_domino_id_transform, + pre_domino_id_transform, +) config.init_from_file("config/config.yaml") diff --git a/test/MinCostFlow/test_mcf.py b/test/MinCostFlow/test_mcf.py index c777a665d..1c9c61a60 100644 --- a/test/MinCostFlow/test_mcf.py +++ b/test/MinCostFlow/test_mcf.py @@ -4,7 +4,7 @@ import pytest import spras.config.config as config -from spras.mincostflow import MinCostFlow +from spras.mincostflow import MinCostFlow, MinCostFlowParams config.init_from_file("config/config.yaml") @@ -21,9 +21,9 @@ def test_mincostflow_required(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE) assert out_path.exists() # TODO: assert for the output .equals expected_output instead of only testing @@ -34,11 +34,11 @@ def test_mincostflow_missing_capacity(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - flow=1) + args=MinCostFlowParams(flow=1)) assert out_path.exists() @pytest.mark.parametrize('graph', ['graph1']) @@ -46,11 +46,11 @@ def test_mincostflow_missing_flow(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + 
'/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - capacity=1) + args=MinCostFlowParams(capacity=1)) assert out_path.exists() @pytest.mark.parametrize('graph', ['graph1']) @@ -59,24 +59,22 @@ def test_mincostflow_too_much_flow(self, graph): out_path.unlink(missing_ok=True) with pytest.raises(RuntimeError): - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - flow=50, - capacity=1) + args=MinCostFlowParams(flow=50, capacity=1)) @pytest.mark.parametrize('graph', ['graph1']) def test_mincostflow_no_flow(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - flow=0, - capacity=1) + args=MinCostFlowParams(flow=0, capacity=1)) assert out_path.exists() @pytest.mark.parametrize('graph', ['graph1']) @@ -84,20 +82,19 @@ def test_mincostflow_all_optional(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include all optional arguments - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - flow=1, - capacity=1) + args=MinCostFlowParams(flow=1, capacity=1)) assert out_path.exists() @pytest.mark.parametrize('graph', ['graph1']) def test_mincostflow_missing(self, graph): # Test the expected error is raised when required arguments are missing with pytest.raises(ValueError): - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt'}, output_file=OUT_FILE) @pytest.mark.parametrize('graph', ['graph1']) @@ -106,12 +103,10 @@ def test_mincostflow_singularity(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include all optional arguments - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - flow=1, - capacity=1, + args=MinCostFlowParams(flow=1, capacity=1), 
container_framework="singularity") assert out_path.exists() - diff --git a/test/OmicsIntegrator2/test_oi2.py b/test/OmicsIntegrator2/test_oi2.py index 0239d5e5f..172197efd 100644 --- a/test/OmicsIntegrator2/test_oi2.py +++ b/test/OmicsIntegrator2/test_oi2.py @@ -8,11 +8,10 @@ config.init_from_file("config/config.yaml") -TEST_DIR = 'test/OmicsIntegrator2/' -EDGE_FILE = TEST_DIR+'input/oi2-edges.txt' -PRIZE_FILE = TEST_DIR+'input/oi2-prizes.txt' -OUT_FILE = Path(TEST_DIR, 'output', 'test.tsv') - +TEST_DIR = Path('test', 'OmicsIntegrator2') +EDGE_FILE = TEST_DIR / 'input' / 'oi2-edges.txt' +PRIZE_FILE = TEST_DIR / 'input' / 'oi2-prizes.txt' +OUT_FILE = TEST_DIR / 'output' / 'test.tsv' class TestOmicsIntegrator2: """ diff --git a/test/PathLinker/test_pathlinker.py b/test/PathLinker/test_pathlinker.py index ed9f10670..67e4b598f 100644 --- a/test/PathLinker/test_pathlinker.py +++ b/test/PathLinker/test_pathlinker.py @@ -4,7 +4,7 @@ import pytest import spras.config.config as config -from spras.pathlinker import PathLinker +from spras.pathlinker import PathLinker, PathLinkerParams config.init_from_file("config/config.yaml") @@ -21,33 +21,28 @@ def test_pathlinker_required(self): out_path = Path(OUT_FILE_DEFAULT) out_path.unlink(missing_ok=True) # Only include required arguments - PathLinker.run( - nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt', - network=TEST_DIR+'input/sample-in-net.txt', - output_file=OUT_FILE_DEFAULT - ) + PathLinker.run({"nodetypes": TEST_DIR+'input/sample-in-nodetypes.txt', + "network": TEST_DIR+'input/sample-in-net.txt'}, + output_file=OUT_FILE_DEFAULT) assert out_path.exists() def test_pathlinker_optional(self): out_path = Path(OUT_FILE_100) out_path.unlink(missing_ok=True) # Include optional argument - PathLinker.run( - nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt', - network=TEST_DIR+'input/sample-in-net.txt', - output_file=OUT_FILE_100, - k=100 - ) + PathLinker.run({"nodetypes": TEST_DIR+'input/sample-in-nodetypes.txt', + "network": TEST_DIR+'input/sample-in-net.txt'}, + output_file=OUT_FILE_100, + args=PathLinkerParams(k=100)) assert out_path.exists() def test_pathlinker_missing(self): # Test the expected error is raised when required arguments are missing with pytest.raises(ValueError): # No nodetypes - PathLinker.run( - network=TEST_DIR + 'input/sample-in-net.txt', - output_file=OUT_FILE_100, - k=100) + PathLinker.run({"network": TEST_DIR + 'input/sample-in-net.txt'}, + output_file=OUT_FILE_100, + args=PathLinkerParams(k=100)) # Only run Singularity test if the binary is available on the system # spython is only available on Unix, but do not explicitly skip non-Unix platforms @@ -56,9 +51,8 @@ def test_pathlinker_singularity(self): out_path = Path(OUT_FILE_DEFAULT) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - PathLinker.run( - nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt', - network=TEST_DIR+'input/sample-in-net.txt', - output_file=OUT_FILE_DEFAULT, - container_framework="singularity") + PathLinker.run({"nodetypes": TEST_DIR+'input/sample-in-nodetypes.txt', + "network": TEST_DIR+'input/sample-in-net.txt'}, + output_file=OUT_FILE_DEFAULT, + container_framework="singularity") assert out_path.exists() diff --git a/test/RWR/test_RWR.py b/test/RWR/test_RWR.py index b0316ded0..70eb06845 100644 --- a/test/RWR/test_RWR.py +++ b/test/RWR/test_RWR.py @@ -5,7 +5,7 @@ import pytest import spras.config.config as config -from spras.rwr import RWR +from spras.rwr import RWR, RWRParams 
config.init_from_file("config/config.yaml") @@ -19,9 +19,9 @@ class TestRWR: """ def test_rwr(self): OUT_FILE.unlink(missing_ok=True) - RWR.run(network=Path(TEST_DIR, 'input', 'rwr-network.txt'), - nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'), - alpha=0.85, + RWR.run({"network": Path(TEST_DIR, 'input', 'rwr-network.txt'), + "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')}, + args=RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE) assert OUT_FILE.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected_output', 'rwr-output.txt') @@ -32,9 +32,9 @@ def test_rwr(self): """ def test_missing_file(self): with pytest.raises(OSError): - RWR.run(network=Path(TEST_DIR, 'input', 'missing.txt'), - nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'), - alpha=0.85, + RWR.run({"network": Path(TEST_DIR, 'input', 'missing.txt'), + "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')}, + args=RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE) """ @@ -42,9 +42,9 @@ def test_missing_file(self): """ def test_format_error(self): with pytest.raises(ValueError): - RWR.run(network=Path(TEST_DIR, 'input', 'rwr-bad-network.txt'), - nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'), - alpha=0.85, + RWR.run({"network": Path(TEST_DIR, 'input', 'rwr-bad-network.txt'), + "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')}, + args=RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE) # Only run Singularity test if the binary is available on the system @@ -53,9 +53,9 @@ def test_format_error(self): def test_rwr_singularity(self): OUT_FILE.unlink(missing_ok=True) # Only include required arguments and run with Singularity - RWR.run(network=Path(TEST_DIR, 'input', 'rwr-network.txt'), - nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'), - alpha=0.85, + RWR.run({"network": Path(TEST_DIR, 'input', 'rwr-network.txt'), + "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')}, + args=RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE, container_framework="singularity") assert OUT_FILE.exists() diff --git a/test/ST_RWR/test_STRWR.py b/test/ST_RWR/test_STRWR.py index 898b24055..ea0c2bda0 100644 --- a/test/ST_RWR/test_STRWR.py +++ b/test/ST_RWR/test_STRWR.py @@ -5,7 +5,7 @@ import pytest import spras.config.config as config -from spras.strwr import ST_RWR +from spras.strwr import ST_RWR, ST_RWRParams config.init_from_file("config/config.yaml") @@ -20,10 +20,10 @@ class TestSTRWR: """ def test_strwr(self): OUT_FILE.unlink(missing_ok=True) - ST_RWR.run(network=Path(TEST_DIR, 'input', 'strwr-network.txt'), - sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'), - targets=Path(TEST_DIR, 'input','strwr-targets.txt'), - alpha=0.85, + ST_RWR.run({"network": Path(TEST_DIR, 'input', 'strwr-network.txt'), + "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'), + "targets": Path(TEST_DIR, 'input','strwr-targets.txt')}, + args=ST_RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE) assert OUT_FILE.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected_output', 'strwr-output.txt') @@ -34,10 +34,10 @@ def test_strwr(self): """ def test_missing_file(self): with pytest.raises(OSError): - ST_RWR.run(network=Path(TEST_DIR, 'input', 'missing.txt'), - sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'), - targets=Path(TEST_DIR, 'input','strwr-targets.txt'), - alpha=0.85, + ST_RWR.run({"network": Path(TEST_DIR, 'input', 'missing.txt'), + "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'), + "targets": Path(TEST_DIR, 'input','strwr-targets.txt')}, + args=ST_RWRParams(alpha=0.85, 
threshold=200), output_file=OUT_FILE) """ @@ -45,10 +45,10 @@ def test_missing_file(self): """ def test_format_error(self): with pytest.raises(ValueError): - ST_RWR.run(network=Path(TEST_DIR, 'input', 'strwr-bad-network.txt'), - sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'), - targets=Path(TEST_DIR, 'input','strwr-targets.txt'), - alpha=0.85, + ST_RWR.run({"network": Path(TEST_DIR, 'input', 'strwr-bad-network.txt'), + "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'), + "targets": Path(TEST_DIR, 'input','strwr-targets.txt')}, + args=ST_RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE) # Only run Singularity test if the binary is available on the system @@ -57,10 +57,10 @@ def test_format_error(self): def test_strwr_singularity(self): OUT_FILE.unlink(missing_ok=True) # Only include required arguments and run with Singularity - ST_RWR.run(network=Path(TEST_DIR, 'input', 'strwr-network.txt'), - sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'), - targets=Path(TEST_DIR, 'input','strwr-targets.txt'), - alpha=0.85, + ST_RWR.run({"network": Path(TEST_DIR, 'input', 'strwr-network.txt'), + "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'), + "targets": Path(TEST_DIR, 'input','strwr-targets.txt')}, + args=ST_RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE, container_framework="singularity") assert OUT_FILE.exists() From da6771166f36fb9d1a1f9d8b651296de02546fd1 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 20:47:08 +0000 Subject: [PATCH 41/60] fix: specify default args out of run --- spras/allpairs.py | 2 +- spras/btb.py | 2 +- spras/domino.py | 7 +++++-- spras/meo.py | 5 ++++- spras/mincostflow.py | 5 ++++- spras/omicsintegrator1.py | 8 ++++---- spras/omicsintegrator2.py | 5 ++++- spras/pathlinker.py | 17 +++++++---------- 8 files changed, 30 insertions(+), 21 deletions(-) diff --git a/spras/allpairs.py b/spras/allpairs.py index 15a3b17f7..670d3f721 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -72,7 +72,7 @@ def generate_inputs(data: Dataset, filename_map): header=["#Interactor1", "Interactor2", "Weight"]) @staticmethod - def run(inputs, output_file, args=Empty(), container_framework="docker"): + def run(inputs, output_file, args=None, container_framework="docker"): """ Run All Pairs Shortest Paths with Docker @param nodetypes: input node types with sources and targets (required) diff --git a/spras/btb.py b/spras/btb.py index 6ad3afb69..81474bdb2 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -65,7 +65,7 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(inputs, output_file, args=Empty(), container_framework="docker"): + def run(inputs, output_file, args=None, container_framework="docker"): # Tests for pytest (docker container also runs this) # Testing out here avoids the trouble that container errors provide diff --git a/spras/domino.py b/spras/domino.py index 86f3c0563..16a70a788 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -76,7 +76,10 @@ def generate_inputs(data, filename_map): header=['ID_interactor_A', 'ppi', 'ID_interactor_B']) @staticmethod - def run(inputs, output_file, args=DominoParams(), container_framework="docker"): + def run(inputs, output_file, args=None, container_framework="docker"): + if not args: + args = DominoParams() + # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. 
if not inputs["network"] or not inputs["active_genes"]: raise ValueError('Required DOMINO arguments are missing') @@ -152,7 +155,7 @@ def run(inputs, output_file, args=DominoParams(), container_framework="docker"): # Clean up DOMINO intermediate and pickle files slices_file.unlink(missing_ok=True) Path(out_dir, 'network.slices.pkl').unlink(missing_ok=True) - Path(network + '.pkl').unlink(missing_ok=True) + Path(f"{inputs['network']}.pkl").unlink(missing_ok=True) @staticmethod def parse_output(raw_pathway_file, standardized_pathway_file, params): diff --git a/spras/meo.py b/spras/meo.py index 30e81d87e..02edf07af 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -145,7 +145,7 @@ def generate_inputs(data, filename_map): # TODO add parameter validation # TODO document required arguments @staticmethod - def run(inputs, args=MEOParams(), output_file=None, container_framework="docker"): + def run(inputs, output_file=None, args=None, container_framework="docker"): """ Run Maximum Edge Orientation in the Docker image with the provided parameters. The properties file is generated from the provided arguments. @@ -154,6 +154,9 @@ def run(inputs, args=MEOParams(), output_file=None, container_framework="docker" Only the edge output file is retained. All other output files are deleted. """ + if not args: + args = MEOParams() + if inputs["edges"] is None or inputs["sources"] is None or inputs["targets"] is None: raise ValueError('Required Maximum Edge Orientation arguments are missing') diff --git a/spras/mincostflow.py b/spras/mincostflow.py index eab80c631..b2267f800 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -72,7 +72,10 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(inputs, output_file, args=MinCostFlowParams(), container_framework="docker"): + def run(inputs, output_file, args=None, container_framework="docker"): + if not args: + args = MinCostFlowParams() + # ensures that these parameters are required if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: raise ValueError('Required MinCostFlow arguments are missing') diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index d8226f735..9152e80a6 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -40,8 +40,8 @@ def write_conf(filename=Path('config.txt'), w=None, b=None, d=None, mu=None, noi class OmicsIntegrator1Params(BaseModel): dummy_mode: Optional[str] = None - mu_squared: Optional[bool] = None - exclude_terms: Optional[bool] = None + mu_squared: bool = False + exclude_terms: bool = False noisy_edges: Optional[int] = None "How many times you would like to add noise to the given edge values and re-run the algorithm." 
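A quick check of what the new boolean defaults buy (a sketch, assuming only the required w, b, and d are supplied); the hunk below then drops the corresponding None guards when building the command:

    from spras.omicsintegrator1 import OmicsIntegrator1Params

    # The toggles now default to False rather than None, so an
    # untouched model is always fully populated.
    params = OmicsIntegrator1Params(w=5, b=1.0, d=10)
    assert params.mu_squared is False and params.exclude_terms is False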
@@ -202,9 +202,9 @@ def run(inputs, output_file, args, container_framework="docker"): command.extend(['--dummyMode', args.dummy_mode]) # Add optional arguments - if args.mu_squared is not None and args.mu_squared: + if args.mu_squared: command.extend(['--musquared']) - if args.exclude_terms is not None and args.exclude_terms: + if args.exclude_terms: command.extend(['--excludeTerms']) if args.noisy_edges is not None: command.extend(['--noisyEdges', str(args.noisy_edges)]) diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 41aec9ee1..fb420de8e 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -103,7 +103,7 @@ def generate_inputs(data: Dataset, filename_map): # TODO add reasonable default values # TODO document required arguments @staticmethod - def run(inputs, output_file, args=OmicsIntegrator2Params(), container_framework="docker"): + def run(inputs, output_file, args=None, container_framework="docker"): """ Run Omics Integrator 2 in the Docker image with the provided parameters. Only the .tsv output file is retained and then renamed. @@ -111,6 +111,9 @@ def run(inputs, output_file, args=OmicsIntegrator2Params(), container_framework= @param output_file: the name of the output file, which will overwrite any existing file with this name @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) """ + if not args: + args = OmicsIntegrator2Params() + if inputs["edges"] is None or inputs["prizes"] is None: raise ValueError('Required Omics Integrator 2 arguments are missing') diff --git a/spras/pathlinker.py b/spras/pathlinker.py index 167403cef..d5ac385f4 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -16,8 +16,8 @@ __all__ = ['PathLinker', 'PathLinkerParams'] class PathLinkerParams(BaseModel): - k: Optional[int] = None - "path length (optional)" + k: int = 100 + "path length" model_config = ConfigDict(use_attribute_docstrings=True) @@ -76,7 +76,7 @@ def generate_inputs(data, filename_map): header=["#Interactor1","Interactor2","Weight"]) @staticmethod - def run(inputs, output_file, args=PathLinkerParams(), container_framework="docker"): + def run(inputs, output_file, args=None, container_framework="docker"): """ Run PathLinker with Docker @param nodetypes: input node types with sources and targets (required) @@ -85,10 +85,9 @@ def run(inputs, output_file, args=PathLinkerParams(), container_framework="docke @param k: @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) """ - # Add additional parameter validation - # Do not require k - # Use the PathLinker default - # Could consider setting the default here instead + if not args: + args = PathLinkerParams() + if not inputs["nodetypes"] or not inputs["network"]: raise ValueError('Required PathLinker arguments are missing') @@ -118,9 +117,7 @@ def run(inputs, output_file, args=PathLinkerParams(), container_framework="docke node_file, '--output', mapped_out_prefix] - # Add optional argument - if args.k is not None: - command.extend(['-k', str(args.k)]) + command.extend(['-k', str(args.k)]) container_suffix = "pathlinker:v2" run_container_and_log('PathLinker', From 45cfe87a46a6850a3f1ce6d890b25c3b47eff781 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Mon, 14 Jul 2025 21:03:20 +0000 Subject: [PATCH 42/60] fix: more defaults --- spras/domino.py | 2 +- spras/mincostflow.py | 2 +- spras/omicsintegrator1.py | 16 ++++++---------- spras/omicsintegrator2.py | 10 +++++----- spras/pathlinker.py | 8 -------- 5 files changed, 13 insertions(+), 25 deletions(-) diff --git a/spras/domino.py b/spras/domino.py index 16a70a788..110b11ab3 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -79,7 +79,7 @@ def generate_inputs(data, filename_map): def run(inputs, output_file, args=None, container_framework="docker"): if not args: args = DominoParams() - + # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. if not inputs["network"] or not inputs["active_genes"]: raise ValueError('Required DOMINO arguments are missing') diff --git a/spras/mincostflow.py b/spras/mincostflow.py index b2267f800..2673d91e2 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -75,7 +75,7 @@ def generate_inputs(data, filename_map): def run(inputs, output_file, args=None, container_framework="docker"): if not args: args = MinCostFlowParams() - + # ensures that these parameters are required if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: raise ValueError('Required MinCostFlow arguments are missing') diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 9152e80a6..74f55bff7 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -43,13 +43,13 @@ class OmicsIntegrator1Params(BaseModel): mu_squared: bool = False exclude_terms: bool = False - noisy_edges: Optional[int] = None + noisy_edges: int = 0 "How many times you would like to add noise to the given edge values and re-run the algorithm." - shuffled_prizes: Optional[int] = None + shuffled_prizes: int = 0 "shuffled_prizes: How many times the algorithm should shuffle the prizes and re-run" - random_terminals: Optional[int] = None + random_terminals: int = 0 "How many times to apply the given prizes to random nodes in the interactome" seed: Optional[int] = None @@ -140,7 +140,6 @@ def generate_inputs(data, filename_map): # TODO add support for knockout argument # TODO add reasonable default values - # TODO document required arguments @staticmethod def run(inputs, output_file, args, container_framework="docker"): if inputs["edges"] is None or inputs["prizes"] is None or output_file is None: @@ -206,12 +205,9 @@ def run(inputs, output_file, args, container_framework="docker"): command.extend(['--musquared']) if args.exclude_terms: command.extend(['--excludeTerms']) - if args.noisy_edges is not None: - command.extend(['--noisyEdges', str(args.noisy_edges)]) - if args.shuffled_prizes is not None: - command.extend(['--shuffledPrizes', str(args.shuffled_prizes)]) - if args.random_terminals is not None: - command.extend(['--randomTerminals', str(args.random_terminals)]) + command.extend(['--noisyEdges', str(args.noisy_edges)]) + command.extend(['--shuffledPrizes', str(args.shuffled_prizes)]) + command.extend(['--randomTerminals', str(args.random_terminals)]) if args.seed is not None: command.extend(['--seed', str(args.seed)]) diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index fb420de8e..f0a2d9c52 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -1,8 +1,9 @@ +import time from pathlib import Path from typing import Optional import pandas as pd -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, Field from spras.containers import 
prepare_volume, run_container_and_log from spras.dataset import Dataset @@ -39,8 +40,8 @@ class OmicsIntegrator2Params(BaseModel): "all" = connect to all nodes in the interactome. """ - seed: Optional[int] = None - "The random seed to use for this run." + seed: int = Field(default_factory=lambda _: int(time.time() * 1000)) + "The random seed to use for this run. Defaults to the current UNIX timestamp." model_config = ConfigDict(use_attribute_docstrings=True) @@ -153,8 +154,7 @@ def run(inputs, output_file, args=None, container_framework="docker"): if args.dummy_mode is not None: # This argument does not follow the other naming conventions command.extend(['--dummyMode', str(args.dummy_mode)]) - if args.seed is not None: - command.extend(['--seed', str(args.seed)]) + command.extend(['--seed', str(args.seed)]) container_suffix = "omics-integrator-2:v2" run_container_and_log('Omics Integrator 2', diff --git a/spras/pathlinker.py index d5ac385f4..9b6fe964c 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -77,14 +77,6 @@ def generate_inputs(data, filename_map): @staticmethod def run(inputs, output_file, args=None, container_framework="docker"): - """ - Run PathLinker with Docker - @param nodetypes: input node types with sources and targets (required) - @param network: input network file (required) - @param output_file: path to the output pathway file (required) - @param k: - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - """ if not args: args = PathLinkerParams() From e0808570331316b5dbfd5af5bc4d2f4702635bb8 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 22:13:13 +0000 Subject: [PATCH 43/60] feat: begin algorithm parsing --- config/config.yaml | 4 +-- spras/config/algorithms.py | 63 ++++++++++++++++++++++++++++++++++++++ spras/config/schema.py | 21 ++----------- spras/runner.py | 43 ++++++++++++++------------ 4 files changed, 91 insertions(+), 40 deletions(-) create mode 100644 spras/config/algorithms.py diff --git a/config/config.yaml index 8092b9eb9..5d23946d4 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -81,8 +81,8 @@ algorithms: rand_restarts: 10 - name: "mincostflow" - params: - include: true + include: true + runs: run1: flow: 1 # The flow must be an int capacity: 1 diff --git a/spras/config/algorithms.py new file mode 100644 index 000000000..9e78788f4 --- /dev/null +++ b/spras/config/algorithms.py @@ -0,0 +1,63 @@ +""" +Dynamic construction of algorithm parameters with runtime type information for +parameter combinations. This has been isolated from schema.py as it is not declarative, +and rather mainly contains validators and lower-level pydantic code. +""" +from typing import Any, cast, Union + +from spras.runner import algorithms +from pydantic import BaseModel, create_model + +__all__ = ['AlgorithmUnion'] + +def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseModel]: + """ + Dynamically constructs a parameter-combination model based on the original args model. + This is the most 'hacky' part of this code, but, thanks to pydantic, we almost* + avoid reflection and preserve rich type information. + """ + # First, we need to take our 'model' and coerce it to permit parameter combinations. + # This assumes that all of the keys are flattened, so we only get a structure like so: + # class AlgorithmParams(BaseModel): + # key1: int + # key2: list[str] + # ...
+ # and we want to transform this to: + # class AlgorithmParamsCombination(BaseModel): + # key1: list[int] + # key2: list[list[str]] + # This function does not worry about getting the cartesian product of this. + + # Map our fields to a list (assuming we have no nested keys) + mapped_list_field: dict[str, type[list[Any]]] = {name: list[field.annotation] for name, field in model.model_fields.items()} + + # Runtime assertion check: mapped_list_field does not contain any `__-prefixed` fields + for key in mapped_list_field.keys(): + assert not key.startswith("__"), f"A private key has been passed from {name}'s argument schema." + \ + "This should have been caught by the Snakemake CI step." + + # Pass this as kwargs to create_model, which usually takes in parameters field_name=type. + # This is the asterisk (*) from the docstring: we do need to cast create_model, since otherwise + # the type-checker complains that we may have had a key that starts with __ in mapped_list_fields. + # The above assertion prevents this. + run_model = (cast(Any, create_model))( + f'{name}RunModel', + **mapped_list_field + ) + + # Here is an example of how this would look like inside config.yaml + # name: pathlinker + # include: true + # runs: + # run1: + # (from run_model) + # ... + return create_model( + f'{name}Model', + name=name, + include=bool, + runs=dict[str, run_model] + ) + +algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model) for name, (_, model) in algorithms.items()] +AlgorithmUnion = Union[tuple(algorithm_models)] diff --git a/spras/config/schema.py b/spras/config/schema.py index 623c9dd9b..7a42673d6 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -11,13 +11,13 @@ """ import re -from typing import Annotated, Optional +from typing import Annotated from pydantic import AfterValidator, BaseModel, ConfigDict, Field +from spras.config.algorithms import AlgorithmUnion from spras.config.util import CaseInsensitiveEnum - class SummaryAnalysis(BaseModel): include: bool @@ -87,21 +87,6 @@ class ContainerRegistry(BaseModel): model_config = ConfigDict(extra='forbid') -class AlgorithmParams(BaseModel): - include: bool - directed: Optional[bool] = None - - # TODO: use array of runs instead. We currently rely on the - # extra parameters here to extract the algorithm parameter information, - # which is why this deviates from the usual ConfigDict(extra='forbid'). - model_config = ConfigDict(extra='allow') - -class Algorithm(BaseModel): - name: str - params: AlgorithmParams - - model_config = ConfigDict(extra='forbid') - class Dataset(BaseModel): label: Annotated[str, AfterValidator(label_validator("Dataset"))] node_files: list[str] @@ -139,7 +124,7 @@ class RawConfig(BaseModel): description="The length of the hash used to identify a parameter combination", default=DEFAULT_HASH_LENGTH) - algorithms: list[Algorithm] + algorithms: list[AlgorithmUnion] # type: ignore - pydantic allows this. 
datasets: list[Dataset] gold_standards: list[GoldStandard] = [] analysis: Analysis = Analysis() reconstruction_settings: ReconstructionSettings model_config = ConfigDict(extra='forbid') diff --git a/spras/runner.py index a023a9606..843b3cf46 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -1,35 +1,38 @@ from typing import Any +from pydantic import BaseModel + # supported algorithm imports from spras.allpairs import AllPairs from spras.btb import BowTieBuilder from spras.dataset import Dataset -from spras.domino import DOMINO -from spras.meo import MEO -from spras.mincostflow import MinCostFlow -from spras.omicsintegrator1 import OmicsIntegrator1 -from spras.omicsintegrator2 import OmicsIntegrator2 -from spras.pathlinker import PathLinker +from spras.config.util import Empty +from spras.domino import DOMINO, DominoParams +from spras.meo import MEO, MEOParams +from spras.mincostflow import MinCostFlow, MinCostFlowParams +from spras.omicsintegrator1 import OmicsIntegrator1, OmicsIntegrator1Params +from spras.omicsintegrator2 import OmicsIntegrator2, OmicsIntegrator2Params +from spras.pathlinker import PathLinker, PathLinkerParams from spras.prm import PRM -from spras.rwr import RWR -from spras.strwr import ST_RWR +from spras.rwr import RWR, RWRParams +from spras.strwr import ST_RWR, ST_RWRParams -algorithms: dict[str, type[PRM]] = { - "allpairs": AllPairs, - "bowtiebuilder": BowTieBuilder, - "domino": DOMINO, - "meo": MEO, - "mincostflow": MinCostFlow, - "omicsintegrator1": OmicsIntegrator1, - "omicsintegrator2": OmicsIntegrator2, - "pathlinker": PathLinker, - "rwr": RWR, - "strwr": ST_RWR, +algorithms: dict[str, tuple[type[PRM], type[BaseModel]]] = { + "allpairs": (AllPairs, Empty), + "bowtiebuilder": (BowTieBuilder, Empty), + "domino": (DOMINO, DominoParams), + "meo": (MEO, MEOParams), + "mincostflow": (MinCostFlow, MinCostFlowParams), + "omicsintegrator1": (OmicsIntegrator1, OmicsIntegrator1Params), + "omicsintegrator2": (OmicsIntegrator2, OmicsIntegrator2Params), + "pathlinker": (PathLinker, PathLinkerParams), + "rwr": (RWR, RWRParams), + "strwr": (ST_RWR, ST_RWRParams), } def get_algorithm(algorithm: str) -> type[PRM]: try: - return algorithms[algorithm.lower()] + return algorithms[algorithm.lower()][0] except KeyError as exc: raise NotImplementedError(f'{algorithm} is not currently supported.') from exc From 53f55e27a7bc1040c77c8941746156f497c214e4 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 22:59:58 +0000 Subject: [PATCH 44/60] fix: clean up type errors, begin nondeterminism --- spras/config/algorithms.py | 13 ++++++------- spras/config/schema.py | 4 ++++ spras/config/util.py | 30 ++++++++++++++++++++++++++++-- spras/domino.py | 5 +++-- spras/omicsintegrator1.py | 7 ++++--- 5 files changed, 45 insertions(+), 14 deletions(-) diff --git a/spras/config/algorithms.py index 9e78788f4..bc7b896fc 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -3,7 +3,7 @@ parameter combinations. This has been isolated from schema.py as it is not declarative, and rather mainly contains validators and lower-level pydantic code. """ -from typing import Any, cast, Union +from typing import Any, cast, Union, Literal from spras.runner import algorithms from pydantic import BaseModel, create_model @@ -13,8 +13,8 @@ def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseModel]: """ Dynamically constructs a parameter-combination model based on the original args model.
- This is the most 'hacky' part of this code, but, thanks to pydantic, we almost* - avoid reflection and preserve rich type information. + This is the most 'hacky' part of this code, but, thanks to pydantic, we avoid reflection + and preserve rich type information at runtime. """ # First, we need to take our 'model' and coerce it to permit parameter combinations. # This assumes that all of the keys are flattened, so we only get a structure like so: @@ -37,9 +37,8 @@ def construct_algorithm_model(name: str, model: type[BaseMod "This should have been caught by the Snakemake CI step." # Pass this as kwargs to create_model, which usually takes in parameters field_name=type. - # This is the asterisk (*) from the docstring: we do need to cast create_model, since otherwise - # the type-checker complains that we may have had a key that starts with __ in mapped_list_fields. - # The above assertion prevents this. + # We do need to cast create_model, since otherwise the type-checker complains that we may + # have had a key that starts with __ in mapped_list_field. The above assertion prevents this. run_model = (cast(Any, create_model))( f'{name}RunModel', **mapped_list_field @@ -54,7 +53,7 @@ def construct_algorithm_model(name: str, model: type[BaseMod # ... return create_model( f'{name}Model', - name=name, + name=Literal[name], include=bool, runs=dict[str, run_model] ) diff --git a/spras/config/schema.py index 7a42673d6..76404b387 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -124,6 +124,7 @@ class RawConfig(BaseModel): description="The length of the hash used to identify a parameter combination", default=DEFAULT_HASH_LENGTH) + # See algorithms.py for more information about AlgorithmUnion algorithms: list[AlgorithmUnion] # type: ignore - pydantic allows this. @@ -132,3 +133,6 @@ class RawConfig(BaseModel): reconstruction_settings: ReconstructionSettings model_config = ConfigDict(extra='forbid') + +# AlgorithmUnion is dynamically constructed. +RawConfig.model_rebuild() diff --git a/spras/config/util.py index 32f19076f..0ed99a26e 100644 --- a/spras/config/util.py +++ b/spras/config/util.py @@ -1,7 +1,14 @@ +""" +General config utilities. This is the only config file +that should be imported by algorithms, and algorithms should +only import this config file. +""" + from enum import Enum +import time from typing import Any -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, Field # https://stackoverflow.com/a/76883868/7589775 @@ -23,6 +30,25 @@ def _missing_(cls, value: Any): class Empty(BaseModel): """ - The empty base model. Used for specifying that an algorithm takes no parameters. + The empty base model. Used for specifying that an algorithm takes no parameters, + yet is deterministic. """ model_config = ConfigDict(extra="forbid") + +class NondeterministicModel(BaseModel): + """ + A nondeterministic model. Any seedless nondeterministic algorithm should extend this. + Internally, this inserts a _time parameter that can be serialized but not + deserialized, and will affect the hash. + """ + + # We don't make this a PrivateAttr for reasons explained in the doc comment. + time: float = Field(default_factory=time.time, alias="_time") + """ + The internal _time parameter. This is a parameter only given to nondeterministic + algorithms that provide no randomness seed.
diff --git a/spras/domino.py b/spras/domino.py
index 30ccc8a84..a9ce7a43b 100644
--- a/spras/domino.py
+++ b/spras/domino.py
@@ -3,9 +3,10 @@
 from typing import Optional

 import pandas as pd
-from pydantic import BaseModel, ConfigDict
+from pydantic import ConfigDict

 from spras.containers import prepare_volume, run_container_and_log
+from spras.config.util import NondeterministicModel
 from spras.interactome import (
     add_constant,
     reinsert_direction_col_undirected,
@@ -18,7 +19,7 @@
 ID_PREFIX = 'ENSG0'
 ID_PREFIX_LEN = len(ID_PREFIX)

-class DominoParams(BaseModel):
+class DominoParams(NondeterministicModel):
     module_threshold: Optional[float] = None
     "the p-value threshold for considering a slice as relevant (optional)"

diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py
index 74f55bff7..ddb934bb5 100644
--- a/spras/omicsintegrator1.py
+++ b/spras/omicsintegrator1.py
@@ -1,7 +1,8 @@
 from pathlib import Path
+import time
 from typing import Optional

-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field

 from spras.containers import prepare_volume, run_container_and_log
 from spras.interactome import reinsert_direction_col_mixed
@@ -52,8 +53,8 @@ class OmicsIntegrator1Params(BaseModel):
     random_terminals: int = 0
     "How many times to apply the given prizes to random nodes in the interactome"

-    seed: Optional[int] = None
-    "the randomness seed to use"
+    seed: int = Field(default_factory=lambda: int(time.time() * 1000))
+    "The random seed to use for this run. Defaults to the current UNIX timestamp in milliseconds."

     w: int
     "the number of trees"

From 2c938ed09708359ebaaf82733646868998df759f Mon Sep 17 00:00:00 2001
From: "Tristan F.-R."
Date: Mon, 14 Jul 2025 16:05:36 -0700
Subject: [PATCH 45/60] fix: add spras.config to pyproject

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index bedbe1628..b18ef12c5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,4 +74,4 @@ select = [
 # py-modules tells setuptools which directory is our actual module
 py-modules = ["spras"]
 # packages tells setuptools what the exported package is called (ie allows import spras)
-packages = ["spras", "spras.analysis"]
+packages = ["spras", "spras.analysis", "spras.config"]

From a4e265d2a2930b3b7b2e44a2907d64be9a08f09c Mon Sep 17 00:00:00 2001
From: "Tristan F.-R."
Date: Mon, 14 Jul 2025 23:15:07 +0000
Subject: [PATCH 46/60] chore: begin little utility

---
 config/config.yaml    |  4 ++--
 util/update_schema.py | 13 +++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)
 create mode 100644 util/update_schema.py

diff --git a/config/config.yaml b/config/config.yaml
index 5d23946d4..7bb58dcdf 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -73,8 +73,8 @@ algorithms:
       g: 3

 - name: "meo"
-  params:
-    include: true
+  include: true
+  runs:
     run1:
       max_path_length: 3
       local_search: "Yes"
diff --git a/util/update_schema.py b/util/update_schema.py
new file mode 100644
index 000000000..c6a7bedca
--- /dev/null
+++ b/util/update_schema.py
@@ -0,0 +1,13 @@
+"""
+Updates config/schema.json.
+This should be done whenever a new algorithm is introduced,
+or the config is otherwise directly changed.
+""" + +import json +from pathlib import Path + +from spras.config.schema import RawConfig + +config_schema = RawConfig.model_json_schema() +Path('config/schema.json').write_text(json.dumps(config_schema, indent=2)) From 145b2ec9c16b736a7e2939cd257f69f0abc456cf Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 16:46:12 -0700 Subject: [PATCH 47/60] chore: mv container schema changes over --- config/config.yaml | 50 +++++++++++++------------ spras/config/config.py | 23 ++---------- spras/config/container_schema.py | 64 ++++++++++++++++++++++++++++++++ spras/config/schema.py | 6 +-- spras/containers.py | 20 +++++----- 5 files changed, 105 insertions(+), 58 deletions(-) create mode 100644 spras/config/container_schema.py diff --git a/config/config.yaml b/config/config.yaml index 7bb58dcdf..a834131e6 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -3,30 +3,32 @@ # The length of the hash used to identify a parameter combination hash_length: 7 -# Specify the container framework used by each PRM wrapper. Valid options include: -# - docker (default if not specified) -# - singularity -- Also known as apptainer, useful in HPC/HTC environments where docker isn't allowed -# - dsub -- experimental with limited support, used for running on Google Cloud with the All of Us cloud environment. -# - There is no support for other environments at the moment. -container_framework: docker - -# Only used if container_framework is set to singularity, this will unpack the singularity containers -# to the local filesystem. This is useful when PRM containers need to run inside another container, -# such as would be the case in an HTCondor/OSPool environment. -# NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way -# that persists after the workflow is complete. To clean up the unpacked containers, the user must -# manually delete them. For convenience, these unpacked files will exist in the current working directory -# under `unpacked`. -unpack_singularity: false - -# Allow the user to configure which container registry containers should be pulled from -# Note that this assumes container names are consistent across registries, and that the -# registry being passed doesn't require authentication for pull actions -container_registry: - base_url: docker.io - # The owner or project of the registry - # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs - owner: reedcompbio +# Collection of container options +containers: + # Specify the container framework used by each PRM wrapper. Valid options include: + # - docker (default if not specified) + # - singularity -- Also known as apptainer, useful in HPC/HTC environments where docker isn't allowed + # - dsub -- experimental with limited support, used for running on Google Cloud with the All of Us cloud environment. + # - There is no support for other environments at the moment. + framework: docker + + # Only used if container_framework is set to singularity, this will unpack the singularity containers + # to the local filesystem. This is useful when PRM containers need to run inside another container, + # such as would be the case in an HTCondor/OSPool environment. + # NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way + # that persists after the workflow is complete. To clean up the unpacked containers, the user must + # manually delete them. 
For convenience, these unpacked files will exist in the current working directory + # under `unpacked`. + unpack_singularity: false + + # Allow the user to configure which container registry containers should be pulled from + # Note that this assumes container names are consistent across registries, and that the + # registry being passed doesn't require authentication for pull actions + registry: + base_url: docker.io + # The owner or project of the registry + # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs + owner: reedcompbio # This list of algorithms should be generated by a script which checks the filesystem for installs. # It shouldn't be changed by mere mortals. (alternatively, we could add a path to executable for each algorithm diff --git a/spras/config/config.py b/spras/config/config.py index c6ac8f8e0..6d6ee4b7e 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -23,13 +23,12 @@ import numpy as np import yaml -from spras.config.schema import Analysis, ContainerFramework, RawConfig +from spras.config.container_schema import ProcessedContainerOptions +from spras.config.schema import Analysis, RawConfig from spras.util import NpHashEncoder, hash_params_sha1_base32 config = None -DEFAULT_CONTAINER_PREFIX = "docker.io/reedcompbio" - # This will get called in the Snakefile, instantiating the singleton with the raw config def init_global(config_dict): global config @@ -67,9 +66,7 @@ def __init__(self, raw_config: dict[str, Any]): # Directory used for storing output self.out_dir = parsed_raw_config.reconstruction_settings.locations.reconstruction_dir # Container framework used by PRMs. Valid options are "docker", "dsub", and "singularity" - self.container_framework = None - # The container prefix (host and organization) to use for images. Default is "docker.io/reedcompbio" - self.container_prefix: str = DEFAULT_CONTAINER_PREFIX + self.container_settings = ProcessedContainerOptions.from_container_settings(parsed_raw_config.containers, parsed_raw_config.hash_length) # A Boolean specifying whether to unpack singularity containers. Default is False self.unpack_singularity = False # A dictionary to store configured datasets against which SPRAS will be run @@ -275,22 +272,8 @@ def process_analysis(self, raw_config: RawConfig): self.analysis_include_evaluation_aggregate_algo = False def process_config(self, raw_config: RawConfig): - # Set up a few top-level config variables self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir - if raw_config.container_framework == ContainerFramework.dsub: - warnings.warn("'dsub' framework integration is experimental and may not be fully supported.", stacklevel=2) - self.container_framework = raw_config.container_framework - - # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. - if raw_config.unpack_singularity and self.container_framework != "singularity": - warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. 
This setting will have no effect.", stacklevel=2) - self.unpack_singularity = raw_config.unpack_singularity - - # Grab registry from the config, and if none is provided default to docker - if raw_config.container_registry and raw_config.container_registry.base_url != "" and raw_config.container_registry.owner != "": - self.container_prefix = raw_config.container_registry.base_url + "/" + raw_config.container_registry.owner - self.process_datasets(raw_config) self.process_algorithms(raw_config) self.process_analysis(raw_config) diff --git a/spras/config/container_schema.py b/spras/config/container_schema.py new file mode 100644 index 000000000..d515e0de0 --- /dev/null +++ b/spras/config/container_schema.py @@ -0,0 +1,64 @@ +""" +The separate container schema specification file. +For information about pydantic, see schema.py. + +We move this to a separate file to allow `containers.py` to explicitly take in +this subsection of the configuration. +""" + +from dataclasses import dataclass +from pydantic import BaseModel, ConfigDict, Field +from typing import Optional +import warnings + +from spras.config.util import CaseInsensitiveEnum + +DEFAULT_CONTAINER_PREFIX = "docker.io/reedcompbio" + +class ContainerFramework(CaseInsensitiveEnum): + docker = 'docker' + # TODO: add apptainer variant once #260 gets merged + singularity = 'singularity' + dsub = 'dsub' + +class ContainerRegistry(BaseModel): + base_url: str + owner: str = Field(description="The owner or project of the registry") + + model_config = ConfigDict(extra='forbid') + +class ContainerSettings(BaseModel): + framework: ContainerFramework = ContainerFramework.docker + unpack_singularity: bool = False + registry: ContainerRegistry + hash_length: Optional[int] = None + +@dataclass +class ProcessedContainerOptions: + container_framework: ContainerFramework + unpack_singularity: bool + container_prefix: str + hash_length: int + + @staticmethod + def from_container_settings(settings: ContainerSettings, default_hash_length: int) -> "ProcessedContainerOptions": + if settings.framework == ContainerFramework.dsub: + warnings.warn("'dsub' framework integration is experimental and may not be fully supported.", stacklevel=2) + container_framework = settings.framework + + # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. + if settings.unpack_singularity and container_framework != "singularity": + warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. 
This setting will have no effect.", stacklevel=2) + unpack_singularity = settings.unpack_singularity + + # Grab registry from the config, and if none is provided default to docker + container_prefix = DEFAULT_CONTAINER_PREFIX + if settings.registry and settings.registry.base_url != "" and settings.registry.owner != "": + container_prefix = settings.registry.base_url + "/" + settings.registry.owner + + return ProcessedContainerOptions( + container_framework=container_framework, + unpack_singularity=unpack_singularity, + container_prefix=container_prefix, + hash_length=settings.hash_length or default_hash_length + ) diff --git a/spras/config/schema.py b/spras/config/schema.py index 76404b387..7657a41a0 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -16,6 +16,7 @@ from pydantic import AfterValidator, BaseModel, ConfigDict, Field from spras.config.algorithms import AlgorithmUnion +from spras.config.container_schema import ContainerSettings from spras.config.util import CaseInsensitiveEnum class SummaryAnalysis(BaseModel): @@ -115,10 +116,7 @@ class ReconstructionSettings(BaseModel): model_config = ConfigDict(extra='forbid') class RawConfig(BaseModel): - # TODO: move these container values to a nested container key - container_framework: ContainerFramework = ContainerFramework.docker - unpack_singularity: bool = False - container_registry: ContainerRegistry + containers: ContainerSettings hash_length: int = Field( description="The length of the hash used to identify a parameter combination", diff --git a/spras/containers.py b/spras/containers.py index 314d4bb45..e41d4737e 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -8,7 +8,7 @@ import docker import docker.errors -import spras.config.config as config +from spras.config.container_schema import ProcessedContainerOptions from spras.logging import indent from spras.util import hash_filename @@ -131,7 +131,7 @@ def env_to_items(environment: dict[str, str]) -> Iterator[str]: # TODO consider a better default environment variable # Follow docker-py's naming conventions (https://docker-py.readthedocs.io/en/stable/containers.html) # Technically the argument is an image, not a container, but we use container here. 
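To make the new explicit dependency concrete before the run_container signature change below, here is a minimal hedged sketch of a call site under this patch's API; the container name, command, and settings values are illustrative assumptions, and from_container_settings would normally build the options from the parsed config rather than by hand:

from spras.config.container_schema import ProcessedContainerOptions
from spras.containers import run_container_and_log

# Illustrative settings; actually executing this call requires a working
# Docker installation, since it pulls and runs the named image.
settings = ProcessedContainerOptions(
    container_framework="docker",
    unpack_singularity=False,
    container_prefix="docker.io/reedcompbio",
    hash_length=7,
)

run_container_and_log(
    name="Example PRM",
    framework="docker",
    container_suffix="allpairs:v4",
    command=["python", "--version"],
    volumes=[],          # (src, dest) PurePath tuples from prepare_volume
    working_dir="/spras",
    config=settings,     # passed explicitly instead of read from the global config
)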
-def run_container(framework: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, environment: Optional[dict[str, str]] = None): +def run_container(framework: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, config: ProcessedContainerOptions, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity or Docker @param framework: singularity or docker @@ -144,17 +144,17 @@ def run_container(framework: str, container_suffix: str, command: List[str], vol """ normalized_framework = framework.casefold() - container = config.config.container_prefix + "/" + container_suffix + container = config.container_prefix + "/" + container_suffix if normalized_framework == 'docker': return run_container_docker(container, command, volumes, working_dir, environment) elif normalized_framework == 'singularity': - return run_container_singularity(container, command, volumes, working_dir, environment) + return run_container_singularity(container, command, volumes, working_dir, config, environment) elif normalized_framework == 'dsub': return run_container_dsub(container, command, volumes, working_dir, environment) else: raise ValueError(f'{framework} is not a recognized container framework. Choose "docker", "dsub", or "singularity".') -def run_container_and_log(name: str, framework: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, environment: Optional[dict[str, str]] = None): +def run_container_and_log(name: str, framework: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, config: ProcessedContainerOptions, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity or Docker with associated pretty printed messages. @param name: the display name of the running container for logging purposes @@ -171,7 +171,7 @@ def run_container_and_log(name: str, framework: str, container_suffix: str, comm print('Running {} on container framework "{}" on env {} with command: {}'.format(name, framework, list(env_to_items(environment)), ' '.join(command)), flush=True) try: - out = run_container(framework=framework, container_suffix=container_suffix, command=command, volumes=volumes, working_dir=working_dir, environment=environment) + out = run_container(framework=framework, container_suffix=container_suffix, command=command, volumes=volumes, working_dir=working_dir, config=config, environment=environment) if out is not None: if isinstance(out, list): out = ''.join(out) @@ -290,7 +290,7 @@ def run_container_docker(container: str, command: List[str], volumes: List[Tuple return out -def run_container_singularity(container: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, environment: Optional[dict[str, str]] = None): +def run_container_singularity(container: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, config: ProcessedContainerOptions, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity. Only available on Linux. @@ -329,7 +329,7 @@ def run_container_singularity(container: str, command: List[str], volumes: List[ singularity_options.extend(['--env', ",".join(env_to_items(environment))]) # Handle unpacking singularity image if needed. 
Potentially needed for running nested unprivileged containers - if config.config.unpack_singularity: + if config.unpack_singularity: # Split the string by "/" path_elements = container.split("/") @@ -369,7 +369,7 @@ def run_container_singularity(container: str, command: List[str], volumes: List[ # Because this is called independently for each file, the same local path can be mounted to multiple volumes -def prepare_volume(filename: Union[str, os.PathLike], volume_base: Union[str, PurePath]) -> Tuple[Tuple[PurePath, PurePath], str]: +def prepare_volume(filename: Union[str, os.PathLike], volume_base: Union[str, PurePath], config: ProcessedContainerOptions) -> Tuple[Tuple[PurePath, PurePath], str]: """ Makes a file on the local file system accessible within a container by mapping the local (source) path to a new container (destination) path and renaming the file to be relative to the destination path. @@ -388,7 +388,7 @@ def prepare_volume(filename: Union[str, os.PathLike], volume_base: Union[str, Pu if isinstance(filename, os.PathLike): filename = str(filename) - filename_hash = hash_filename(filename, config.config.hash_length) + filename_hash = hash_filename(filename, config.hash_length) dest = PurePosixPath(base_path, filename_hash) abs_filename = Path(filename).resolve() From 5effe6980aebcb57dfd61b588ab39c93dcc33cbd Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 15 Jul 2025 00:00:07 +0000 Subject: [PATCH 48/60] feat: initial schema --- config/schema.json | 1153 ++++++++++++++++++++++++++++++ spras/allpairs.py | 20 +- spras/btb.py | 15 +- spras/config/config.py | 4 +- spras/config/container_schema.py | 20 +- spras/containers.py | 26 +- spras/prm.py | 5 +- 7 files changed, 1196 insertions(+), 47 deletions(-) create mode 100644 config/schema.json diff --git a/config/schema.json b/config/schema.json new file mode 100644 index 000000000..f99541d51 --- /dev/null +++ b/config/schema.json @@ -0,0 +1,1153 @@ +{ + "$defs": { + "Analysis": { + "additionalProperties": false, + "properties": { + "summary": { + "$ref": "#/$defs/SummaryAnalysis", + "default": { + "include": false + } + }, + "cytoscape": { + "$ref": "#/$defs/CytoscapeAnalysis", + "default": { + "include": false + } + }, + "ml": { + "$ref": "#/$defs/MlAnalysis", + "default": { + "include": false, + "aggregate_per_algorithm": false, + "components": 2, + "labels": true, + "linkage": "ward", + "metric": "euclidean" + } + }, + "evaluation": { + "$ref": "#/$defs/EvaluationAnalysis", + "default": { + "include": false, + "aggregate_per_algorithm": false + } + } + }, + "title": "Analysis", + "type": "object" + }, + "ContainerFramework": { + "enum": [ + "docker", + "singularity", + "dsub" + ], + "title": "ContainerFramework", + "type": "string" + }, + "ContainerRegistry": { + "additionalProperties": false, + "properties": { + "base_url": { + "title": "Base Url", + "type": "string" + }, + "owner": { + "description": "The owner or project of the registry", + "title": "Owner", + "type": "string" + } + }, + "required": [ + "base_url", + "owner" + ], + "title": "ContainerRegistry", + "type": "object" + }, + "ContainerSettings": { + "properties": { + "framework": { + "$ref": "#/$defs/ContainerFramework", + "default": "docker" + }, + "unpack_singularity": { + "default": false, + "title": "Unpack Singularity", + "type": "boolean" + }, + "registry": { + "$ref": "#/$defs/ContainerRegistry" + }, + "hash_length": { + "default": 7, + "title": "Hash Length", + "type": "integer" + } + }, + "required": [ + "registry" + ], + "title": 
"ContainerSettings", + "type": "object" + }, + "CytoscapeAnalysis": { + "additionalProperties": false, + "properties": { + "include": { + "title": "Include", + "type": "boolean" + } + }, + "required": [ + "include" + ], + "title": "CytoscapeAnalysis", + "type": "object" + }, + "Dataset": { + "additionalProperties": false, + "properties": { + "label": { + "title": "Label", + "type": "string" + }, + "node_files": { + "items": { + "type": "string" + }, + "title": "Node Files", + "type": "array" + }, + "edge_files": { + "items": { + "type": "string" + }, + "title": "Edge Files", + "type": "array" + }, + "other_files": { + "items": { + "type": "string" + }, + "title": "Other Files", + "type": "array" + }, + "data_dir": { + "title": "Data Dir", + "type": "string" + } + }, + "required": [ + "label", + "node_files", + "edge_files", + "other_files", + "data_dir" + ], + "title": "Dataset", + "type": "object" + }, + "EvaluationAnalysis": { + "additionalProperties": false, + "properties": { + "include": { + "title": "Include", + "type": "boolean" + }, + "aggregate_per_algorithm": { + "default": false, + "title": "Aggregate Per Algorithm", + "type": "boolean" + } + }, + "required": [ + "include" + ], + "title": "EvaluationAnalysis", + "type": "object" + }, + "GoldStandard": { + "additionalProperties": false, + "properties": { + "label": { + "title": "Label", + "type": "string" + }, + "node_files": { + "items": { + "type": "string" + }, + "title": "Node Files", + "type": "array" + }, + "data_dir": { + "title": "Data Dir", + "type": "string" + }, + "dataset_labels": { + "items": { + "type": "string" + }, + "title": "Dataset Labels", + "type": "array" + } + }, + "required": [ + "label", + "node_files", + "data_dir", + "dataset_labels" + ], + "title": "GoldStandard", + "type": "object" + }, + "Locations": { + "additionalProperties": false, + "properties": { + "reconstruction_dir": { + "title": "Reconstruction Dir", + "type": "string" + } + }, + "required": [ + "reconstruction_dir" + ], + "title": "Locations", + "type": "object" + }, + "MlAnalysis": { + "additionalProperties": false, + "properties": { + "include": { + "title": "Include", + "type": "boolean" + }, + "aggregate_per_algorithm": { + "default": false, + "title": "Aggregate Per Algorithm", + "type": "boolean" + }, + "components": { + "default": 2, + "title": "Components", + "type": "integer" + }, + "labels": { + "default": true, + "title": "Labels", + "type": "boolean" + }, + "linkage": { + "$ref": "#/$defs/MlLinkage", + "default": "ward" + }, + "metric": { + "$ref": "#/$defs/MlMetric", + "default": "euclidean" + } + }, + "required": [ + "include" + ], + "title": "MlAnalysis", + "type": "object" + }, + "MlLinkage": { + "enum": [ + "ward", + "complete", + "average", + "single" + ], + "title": "MlLinkage", + "type": "string" + }, + "MlMetric": { + "enum": [ + "euclidean", + "manhattan", + "cosine" + ], + "title": "MlMetric", + "type": "string" + }, + "ReconstructionSettings": { + "additionalProperties": false, + "properties": { + "locations": { + "$ref": "#/$defs/Locations" + } + }, + "required": [ + "locations" + ], + "title": "ReconstructionSettings", + "type": "object" + }, + "SummaryAnalysis": { + "additionalProperties": false, + "properties": { + "include": { + "title": "Include", + "type": "boolean" + } + }, + "required": [ + "include" + ], + "title": "SummaryAnalysis", + "type": "object" + }, + "allpairsModel": { + "properties": { + "name": { + "const": "allpairs", + "title": "Name", + "type": "string" + }, + "include": { + "title": 
"Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/allpairsRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "allpairsModel", + "type": "object" + }, + "allpairsRunModel": { + "properties": {}, + "title": "allpairsRunModel", + "type": "object" + }, + "bowtiebuilderModel": { + "properties": { + "name": { + "const": "bowtiebuilder", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/bowtiebuilderRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "bowtiebuilderModel", + "type": "object" + }, + "bowtiebuilderRunModel": { + "properties": {}, + "title": "bowtiebuilderRunModel", + "type": "object" + }, + "dominoModel": { + "properties": { + "name": { + "const": "domino", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/dominoRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "dominoModel", + "type": "object" + }, + "dominoRunModel": { + "properties": { + "time": { + "items": { + "type": "number" + }, + "title": "Time", + "type": "array" + }, + "module_threshold": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Module Threshold", + "type": "array" + }, + "slice_threshold": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Slice Threshold", + "type": "array" + } + }, + "required": [ + "time", + "module_threshold", + "slice_threshold" + ], + "title": "dominoRunModel", + "type": "object" + }, + "meoModel": { + "properties": { + "name": { + "const": "meo", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/meoRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "meoModel", + "type": "object" + }, + "meoRunModel": { + "properties": { + "max_path_length": { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "title": "Max Path Length", + "type": "array" + }, + "local_search": { + "items": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ] + }, + "title": "Local Search", + "type": "array" + }, + "rand_restarts": { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "title": "Rand Restarts", + "type": "array" + } + }, + "required": [ + "max_path_length", + "local_search", + "rand_restarts" + ], + "title": "meoRunModel", + "type": "object" + }, + "mincostflowModel": { + "properties": { + "name": { + "const": "mincostflow", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/mincostflowRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "mincostflowModel", + "type": "object" + }, + "mincostflowRunModel": { + "properties": { + "flow": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Flow", + "type": 
"array" + }, + "capacity": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Capacity", + "type": "array" + } + }, + "required": [ + "flow", + "capacity" + ], + "title": "mincostflowRunModel", + "type": "object" + }, + "omicsintegrator1Model": { + "properties": { + "name": { + "const": "omicsintegrator1", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/omicsintegrator1RunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "omicsintegrator1Model", + "type": "object" + }, + "omicsintegrator1RunModel": { + "properties": { + "dummy_mode": { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "title": "Dummy Mode", + "type": "array" + }, + "mu_squared": { + "items": { + "type": "boolean" + }, + "title": "Mu Squared", + "type": "array" + }, + "exclude_terms": { + "items": { + "type": "boolean" + }, + "title": "Exclude Terms", + "type": "array" + }, + "noisy_edges": { + "items": { + "type": "integer" + }, + "title": "Noisy Edges", + "type": "array" + }, + "shuffled_prizes": { + "items": { + "type": "integer" + }, + "title": "Shuffled Prizes", + "type": "array" + }, + "random_terminals": { + "items": { + "type": "integer" + }, + "title": "Random Terminals", + "type": "array" + }, + "seed": { + "items": { + "type": "integer" + }, + "title": "Seed", + "type": "array" + }, + "w": { + "items": { + "type": "integer" + }, + "title": "W", + "type": "array" + }, + "b": { + "items": { + "type": "number" + }, + "title": "B", + "type": "array" + }, + "d": { + "items": { + "type": "integer" + }, + "title": "D", + "type": "array" + }, + "mu": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Mu", + "type": "array" + }, + "noise": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Noise", + "type": "array" + }, + "g": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "G", + "type": "array" + }, + "r": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "R", + "type": "array" + } + }, + "required": [ + "dummy_mode", + "mu_squared", + "exclude_terms", + "noisy_edges", + "shuffled_prizes", + "random_terminals", + "seed", + "w", + "b", + "d", + "mu", + "noise", + "g", + "r" + ], + "title": "omicsintegrator1RunModel", + "type": "object" + }, + "omicsintegrator2Model": { + "properties": { + "name": { + "const": "omicsintegrator2", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/omicsintegrator2RunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "omicsintegrator2Model", + "type": "object" + }, + "omicsintegrator2RunModel": { + "properties": { + "w": { + "items": { + "type": "number" + }, + "title": "W", + "type": "array" + }, + "b": { + "items": { + "type": "number" + }, + "title": "B", + "type": "array" + }, + "g": { + "items": { + "type": "number" + }, + "title": "G", + "type": "array" + }, + "noise": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Noise", + "type": "array" + }, + "noisy_edges": { + 
"items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "title": "Noisy Edges", + "type": "array" + }, + "random_terminals": { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "title": "Random Terminals", + "type": "array" + }, + "dummy_mode": { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "title": "Dummy Mode", + "type": "array" + }, + "seed": { + "items": { + "type": "integer" + }, + "title": "Seed", + "type": "array" + } + }, + "required": [ + "w", + "b", + "g", + "noise", + "noisy_edges", + "random_terminals", + "dummy_mode", + "seed" + ], + "title": "omicsintegrator2RunModel", + "type": "object" + }, + "pathlinkerModel": { + "properties": { + "name": { + "const": "pathlinker", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/pathlinkerRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "pathlinkerModel", + "type": "object" + }, + "pathlinkerRunModel": { + "properties": { + "k": { + "items": { + "type": "integer" + }, + "title": "K", + "type": "array" + } + }, + "required": [ + "k" + ], + "title": "pathlinkerRunModel", + "type": "object" + }, + "rwrModel": { + "properties": { + "name": { + "const": "rwr", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/rwrRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "rwrModel", + "type": "object" + }, + "rwrRunModel": { + "properties": { + "threshold": { + "items": { + "type": "integer" + }, + "title": "Threshold", + "type": "array" + }, + "alpha": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Alpha", + "type": "array" + } + }, + "required": [ + "threshold", + "alpha" + ], + "title": "rwrRunModel", + "type": "object" + }, + "strwrModel": { + "properties": { + "name": { + "const": "strwr", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/strwrRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "strwrModel", + "type": "object" + }, + "strwrRunModel": { + "properties": { + "threshold": { + "items": { + "type": "integer" + }, + "title": "Threshold", + "type": "array" + }, + "alpha": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Alpha", + "type": "array" + } + }, + "required": [ + "threshold", + "alpha" + ], + "title": "strwrRunModel", + "type": "object" + } + }, + "additionalProperties": false, + "properties": { + "containers": { + "$ref": "#/$defs/ContainerSettings" + }, + "hash_length": { + "default": 7, + "description": "The length of the hash used to identify a parameter combination", + "title": "Hash Length", + "type": "integer" + }, + "algorithms": { + "items": { + "anyOf": [ + { + "$ref": "#/$defs/allpairsModel" + }, + { + "$ref": "#/$defs/bowtiebuilderModel" + }, + { + "$ref": "#/$defs/dominoModel" + }, + { + "$ref": "#/$defs/meoModel" + }, + { + "$ref": "#/$defs/mincostflowModel" + }, + { + "$ref": "#/$defs/omicsintegrator1Model" + }, + { + "$ref": 
"#/$defs/omicsintegrator2Model" + }, + { + "$ref": "#/$defs/pathlinkerModel" + }, + { + "$ref": "#/$defs/rwrModel" + }, + { + "$ref": "#/$defs/strwrModel" + } + ] + }, + "title": "Algorithms", + "type": "array" + }, + "datasets": { + "items": { + "$ref": "#/$defs/Dataset" + }, + "title": "Datasets", + "type": "array" + }, + "gold_standards": { + "default": [], + "items": { + "$ref": "#/$defs/GoldStandard" + }, + "title": "Gold Standards", + "type": "array" + }, + "analysis": { + "$ref": "#/$defs/Analysis", + "default": { + "summary": { + "include": false + }, + "cytoscape": { + "include": false + }, + "ml": { + "aggregate_per_algorithm": false, + "components": 2, + "include": false, + "labels": true, + "linkage": "ward", + "metric": "euclidean" + }, + "evaluation": { + "aggregate_per_algorithm": false, + "include": false + } + } + }, + "reconstruction_settings": { + "$ref": "#/$defs/ReconstructionSettings" + } + }, + "required": [ + "containers", + "algorithms", + "datasets", + "reconstruction_settings" + ], + "title": "RawConfig", + "type": "object" +} \ No newline at end of file diff --git a/spras/allpairs.py b/spras/allpairs.py index 670d3f721..bba5df467 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -1,6 +1,7 @@ import warnings from pathlib import Path +from spras.config.container_schema import ProcessedContainerSettings from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset @@ -72,14 +73,7 @@ def generate_inputs(data: Dataset, filename_map): header=["#Interactor1", "Interactor2", "Weight"]) @staticmethod - def run(inputs, output_file, args=None, container_framework="docker"): - """ - Run All Pairs Shortest Paths with Docker - @param nodetypes: input node types with sources and targets (required) - @param network: input network file (required) - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - @param output_file: path to the output pathway file (required) - """ + def run(inputs, output_file, args=None, container_settings=ProcessedContainerSettings()): if not inputs["nodetypes"] or not inputs["network"] or not inputs["directed_flag"]: raise ValueError('Required All Pairs Shortest Paths arguments are missing') @@ -88,15 +82,15 @@ def run(inputs, output_file, args=None, container_framework="docker"): # Each volume is a tuple (src, dest) volumes = list() - bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir) + bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir, container_settings) volumes.append(bind_path) - bind_path, network_file = prepare_volume(inputs["network"], work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) # Create the parent directories for the output file if needed Path(output_file).parent.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_file = prepare_volume(output_file, work_dir) + bind_path, mapped_out_file = prepare_volume(output_file, work_dir, container_settings) volumes.append(bind_path) command = ['python', @@ -110,11 +104,11 @@ def run(inputs, output_file, args=None, container_framework="docker"): container_suffix = "allpairs:v4" run_container_and_log( 'All Pairs Shortest Paths', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) @staticmethod def parse_output(raw_pathway_file, standardized_pathway_file, params): diff --git 
a/spras/btb.py b/spras/btb.py index 81474bdb2..7f7a1b944 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -1,5 +1,6 @@ from pathlib import Path +from spras.config.container_schema import ProcessedContainerSettings from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( @@ -65,7 +66,7 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(inputs, output_file, args=None, container_framework="docker"): + def run(inputs, output_file, args=None, container_settings=ProcessedContainerSettings()): # Tests for pytest (docker container also runs this) # Testing out here avoids the trouble that container errors provide @@ -93,19 +94,19 @@ def run(inputs, output_file, args=None, container_framework="docker"): # Each volume is a tuple (src, dest) volumes = list() - bind_path, source_file = prepare_volume(inputs["sources"], work_dir) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, target_file = prepare_volume(inputs["targets"], work_dir) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) - bind_path, edges_file = prepare_volume(inputs["edges"], work_dir) + bind_path, edges_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) # Use its --output argument to set the output file prefix to specify an absolute path and prefix out_dir = Path(output_file).parent out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = mapped_out_dir + '/raw-pathway.txt' # Use posix path inside the container @@ -122,11 +123,11 @@ def run(inputs, output_file, args=None, container_framework="docker"): container_suffix = "bowtiebuilder:v2" run_container_and_log('BowTieBuilder', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) # Output is already written to raw-pathway.txt file diff --git a/spras/config/config.py b/spras/config/config.py index 6d6ee4b7e..252d6ccf5 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -23,7 +23,7 @@ import numpy as np import yaml -from spras.config.container_schema import ProcessedContainerOptions +from spras.config.container_schema import ProcessedContainerSettings from spras.config.schema import Analysis, RawConfig from spras.util import NpHashEncoder, hash_params_sha1_base32 @@ -66,7 +66,7 @@ def __init__(self, raw_config: dict[str, Any]): # Directory used for storing output self.out_dir = parsed_raw_config.reconstruction_settings.locations.reconstruction_dir # Container framework used by PRMs. Valid options are "docker", "dsub", and "singularity" - self.container_settings = ProcessedContainerOptions.from_container_settings(parsed_raw_config.containers, parsed_raw_config.hash_length) + self.container_settings = ProcessedContainerSettings.from_container_settings(parsed_raw_config.containers, parsed_raw_config.hash_length) # A Boolean specifying whether to unpack singularity containers. 
Default is False self.unpack_singularity = False # A dictionary to store configured datasets against which SPRAS will be run diff --git a/spras/config/container_schema.py b/spras/config/container_schema.py index d515e0de0..9688a9b51 100644 --- a/spras/config/container_schema.py +++ b/spras/config/container_schema.py @@ -31,17 +31,17 @@ class ContainerSettings(BaseModel): framework: ContainerFramework = ContainerFramework.docker unpack_singularity: bool = False registry: ContainerRegistry - hash_length: Optional[int] = None + hash_length: int = 7 @dataclass -class ProcessedContainerOptions: - container_framework: ContainerFramework - unpack_singularity: bool - container_prefix: str - hash_length: int +class ProcessedContainerSettings: + framework: ContainerFramework = ContainerFramework.docker + unpack_singularity: bool = False + prefix: str = DEFAULT_CONTAINER_PREFIX + hash_length: int = 7 @staticmethod - def from_container_settings(settings: ContainerSettings, default_hash_length: int) -> "ProcessedContainerOptions": + def from_container_settings(settings: ContainerSettings, default_hash_length: int) -> "ProcessedContainerSettings": if settings.framework == ContainerFramework.dsub: warnings.warn("'dsub' framework integration is experimental and may not be fully supported.", stacklevel=2) container_framework = settings.framework @@ -56,9 +56,9 @@ def from_container_settings(settings: ContainerSettings, default_hash_length: in if settings.registry and settings.registry.base_url != "" and settings.registry.owner != "": container_prefix = settings.registry.base_url + "/" + settings.registry.owner - return ProcessedContainerOptions( - container_framework=container_framework, + return ProcessedContainerSettings( + framework=container_framework, unpack_singularity=unpack_singularity, - container_prefix=container_prefix, + prefix=container_prefix, hash_length=settings.hash_length or default_hash_length ) diff --git a/spras/containers.py b/spras/containers.py index e41d4737e..d065b2ea8 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -8,7 +8,7 @@ import docker import docker.errors -from spras.config.container_schema import ProcessedContainerOptions +from spras.config.container_schema import ProcessedContainerSettings from spras.logging import indent from spras.util import hash_filename @@ -131,47 +131,47 @@ def env_to_items(environment: dict[str, str]) -> Iterator[str]: # TODO consider a better default environment variable # Follow docker-py's naming conventions (https://docker-py.readthedocs.io/en/stable/containers.html) # Technically the argument is an image, not a container, but we use container here. 
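Before the corresponding containers.py signature updates below, a hedged sketch of how the renamed helper is meant to be used; the registry values mirror the shipped defaults and are illustrative only:

from spras.config.container_schema import (
    ContainerFramework,
    ContainerRegistry,
    ContainerSettings,
    ProcessedContainerSettings,
)

raw = ContainerSettings(
    registry=ContainerRegistry(base_url="docker.io", owner="reedcompbio"),
)

# hash_length falls back to the top-level default when the container
# section does not override it.
processed = ProcessedContainerSettings.from_container_settings(raw, default_hash_length=7)
assert processed.prefix == "docker.io/reedcompbio"
assert processed.framework == ContainerFramework.docker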
-def run_container(framework: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, config: ProcessedContainerOptions, environment: Optional[dict[str, str]] = None): +def run_container(container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, container_settings: ProcessedContainerSettings, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity or Docker - @param framework: singularity or docker @param container_suffix: name of the DockerHub container without the 'docker://' prefix @param command: command to run in the container @param volumes: a list of volumes to mount where each item is a (source, destination) tuple @param working_dir: the working directory in the container + @param container_settings: the settings to use to run the container @param environment: environment variables to set in the container @return: output from Singularity execute or Docker run """ - normalized_framework = framework.casefold() + normalized_framework = container_settings.framework.casefold() - container = config.container_prefix + "/" + container_suffix + container = container_settings.prefix + "/" + container_suffix if normalized_framework == 'docker': return run_container_docker(container, command, volumes, working_dir, environment) elif normalized_framework == 'singularity': - return run_container_singularity(container, command, volumes, working_dir, config, environment) + return run_container_singularity(container, command, volumes, working_dir, container_settings, environment) elif normalized_framework == 'dsub': return run_container_dsub(container, command, volumes, working_dir, environment) else: - raise ValueError(f'{framework} is not a recognized container framework. Choose "docker", "dsub", or "singularity".') + raise ValueError(f'{container_settings.framework} is not a recognized container framework. Choose "docker", "dsub", or "singularity".') -def run_container_and_log(name: str, framework: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, config: ProcessedContainerOptions, environment: Optional[dict[str, str]] = None): +def run_container_and_log(name: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, container_settings: ProcessedContainerSettings, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity or Docker with associated pretty printed messages. 
@param name: the display name of the running container for logging purposes - @param framework: singularity or docker @param container_suffix: name of the DockerHub container without the 'docker://' prefix @param command: command to run in the container @param volumes: a list of volumes to mount where each item is a (source, destination) tuple @param working_dir: the working directory in the container + @param container_settings: the container settings to use @param environment: environment variables to set in the container @return: output from Singularity execute or Docker run """ if not environment: environment = {'SPRAS': 'True'} - print('Running {} on container framework "{}" on env {} with command: {}'.format(name, framework, list(env_to_items(environment)), ' '.join(command)), flush=True) + print('Running {} on container framework "{}" on env {} with command: {}'.format(name, container_settings.framework, list(env_to_items(environment)), ' '.join(command)), flush=True) try: - out = run_container(framework=framework, container_suffix=container_suffix, command=command, volumes=volumes, working_dir=working_dir, config=config, environment=environment) + out = run_container(container_suffix=container_suffix, command=command, volumes=volumes, working_dir=working_dir, container_settings=container_settings, environment=environment) if out is not None: if isinstance(out, list): out = ''.join(out) @@ -290,7 +290,7 @@ def run_container_docker(container: str, command: List[str], volumes: List[Tuple return out -def run_container_singularity(container: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, config: ProcessedContainerOptions, environment: Optional[dict[str, str]] = None): +def run_container_singularity(container: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, config: ProcessedContainerSettings, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity. Only available on Linux. @@ -369,7 +369,7 @@ def run_container_singularity(container: str, command: List[str], volumes: List[ # Because this is called independently for each file, the same local path can be mounted to multiple volumes -def prepare_volume(filename: Union[str, os.PathLike], volume_base: Union[str, PurePath], config: ProcessedContainerOptions) -> Tuple[Tuple[PurePath, PurePath], str]: +def prepare_volume(filename: Union[str, os.PathLike], volume_base: Union[str, PurePath], config: ProcessedContainerSettings) -> Tuple[Tuple[PurePath, PurePath], str]: """ Makes a file on the local file system accessible within a container by mapping the local (source) path to a new container (destination) path and renaming the file to be relative to the destination path. 
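Since prepare_volume now takes the processed settings as well, a wrapper's call site changes roughly as in this sketch; the file path and working directory are hypothetical:

from spras.config.container_schema import ProcessedContainerSettings
from spras.containers import prepare_volume

# After this patch every field has a default, so wrappers can fall back to
# ProcessedContainerSettings() when no explicit settings are supplied.
settings = ProcessedContainerSettings()

work_dir = '/spras'
# Returns the (src, dest) volume pair to mount plus the container-side path,
# where the filename is hashed using settings.hash_length.
bind_path, network_file = prepare_volume('input/network.txt', work_dir, settings)
volumes = [bind_path]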
diff --git a/spras/prm.py b/spras/prm.py index 73c94454a..d52214083 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -4,6 +4,7 @@ from pydantic import BaseModel +from spras.config.container_schema import ProcessedContainerSettings from spras.dataset import Dataset T = TypeVar('T', bound=BaseModel) @@ -42,10 +43,10 @@ def generate_inputs(data: Dataset, filename_map: dict[str, str]): @staticmethod @abstractmethod - def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: T, container_framework="docker"): + def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: T, container_settings: ProcessedContainerSettings): """ Runs an algorithm with the specified inputs, algorithm params (T), - the designated output_file, and the desired container_framework. + the designated output_file, and the desired container_settings. """ raise NotImplementedError From 398350e68a3f1776e829c5f7f4823560cd73f7b8 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Tue, 15 Jul 2025 08:22:58 -0700 Subject: [PATCH 49/60] feat: more algs schema handling --- config/config.yaml | 30 +- config/schema.json | 1017 ++++++++++++++++++++++++------------ spras/config/algorithms.py | 57 +- 3 files changed, 756 insertions(+), 348 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index a834131e6..3e2127d53 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,3 +1,5 @@ +# yaml-language-server: $schema=./schema.json + # Global workflow control # The length of the hash used to identify a parameter combination @@ -50,14 +52,14 @@ containers: algorithms: - name: "pathlinker" - params: - include: true + include: true + runs: run1: k: range(100,201,100) - name: "omicsintegrator1" - params: - include: true + include: true + runs: run1: b: [5, 6] w: np.linspace(0,5,2) @@ -65,8 +67,8 @@ algorithms: dummy_mode: "file" # Or "terminals", "all", "others" - name: "omicsintegrator2" - params: - include: true + include: true + runs: run1: b: 4 g: 0 @@ -79,7 +81,7 @@ algorithms: runs: run1: max_path_length: 3 - local_search: "Yes" + local_search: true rand_restarts: 10 - name: "mincostflow" @@ -90,8 +92,7 @@ algorithms: capacity: 1 - name: "allpairs" - params: - include: true + include: true - name: "domino" params: @@ -101,22 +102,21 @@ algorithms: module_threshold: 0.05 - name: "strwr" - params: - include: true + include: true + runs: run1: alpha: [0.85] threshold: [100, 200] - name: "rwr" - params: - include: true + include: true + runs: run1: alpha: [0.85] threshold: [100, 200] - name: "bowtiebuilder" - params: - include: true + include: true # Here we specify which pathways to run and other file location information. # DataLoader.py can currently only load a single dataset diff --git a/config/schema.json b/config/schema.json index f99541d51..01494a4ea 100644 --- a/config/schema.json +++ b/config/schema.json @@ -393,47 +393,77 @@ }, "dominoRunModel": { "properties": { - "time": { - "items": { - "type": "number" - }, - "title": "Time", - "type": "array" + "_time": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The internal _time parameter. This is a parameter only given to nondeterminsitic\nalgorithms that provide no randomness seed. 
While this should be unset,\nwe allow specifying `_time` for users that want to re-use outputs of runs,\nthough this explicitly breaks the 'immutability' promise of runs.", + "title": "Time" }, "module_threshold": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Module Threshold", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "the p-value threshold for considering a slice as relevant (optional)", + "title": "Module Threshold" }, "slice_threshold": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Slice Threshold", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "the p-value threshold for considering a putative module as final module (optional)", + "title": "Slice Threshold" } }, - "required": [ - "time", - "module_threshold", - "slice_threshold" - ], "title": "dominoRunModel", "type": "object" }, @@ -467,53 +497,84 @@ "meoRunModel": { "properties": { "max_path_length": { - "items": { - "anyOf": [ - { - "type": "integer" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Max Path Length", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "the maximal length of a path from sources and targets to orient.", + "title": "Max Path Length" }, "local_search": { - "items": { - "anyOf": [ - { - "type": "boolean" + "anyOf": [ + { + "type": "boolean" + }, + { + "items": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Local Search", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "a boolean parameter that enables MEO's local search functionality.\nSee \"Improving approximations with local search\" in the associated paper\nfor more information.", + "title": "Local Search" }, "rand_restarts": { - "items": { - "anyOf": [ - { - "type": "integer" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Rand Restarts", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The number of random restarts to do.", + "title": "Rand Restarts" } }, - "required": [ - "max_path_length", - "local_search", - "rand_restarts" - ], "title": "meoRunModel", "type": "object" }, @@ -547,38 +608,58 @@ "mincostflowRunModel": { "properties": { "flow": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Flow", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "amount of flow going through the graph", + "title": "Flow" }, "capacity": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": 
[ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Capacity", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "amount of capacity allowed on each edge", + "title": "Capacity" } }, - "required": [ - "flow", - "capacity" - ], "title": "mincostflowRunModel", "type": "object" }, @@ -612,154 +693,298 @@ "omicsintegrator1RunModel": { "properties": { "dummy_mode": { - "items": { - "anyOf": [ - { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Dummy Mode", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Dummy Mode" }, "mu_squared": { - "items": { - "type": "boolean" - }, - "title": "Mu Squared", - "type": "array" + "anyOf": [ + { + "type": "boolean" + }, + { + "items": { + "type": "boolean" + }, + "type": "array" + } + ], + "default": false, + "title": "Mu Squared" }, "exclude_terms": { - "items": { - "type": "boolean" - }, - "title": "Exclude Terms", - "type": "array" + "anyOf": [ + { + "type": "boolean" + }, + { + "items": { + "type": "boolean" + }, + "type": "array" + } + ], + "default": false, + "title": "Exclude Terms" }, "noisy_edges": { - "items": { - "type": "integer" - }, - "title": "Noisy Edges", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 0, + "description": "How many times you would like to add noise to the given edge values and re-run the algorithm.", + "title": "Noisy Edges" }, "shuffled_prizes": { - "items": { - "type": "integer" - }, - "title": "Shuffled Prizes", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 0, + "description": "shuffled_prizes: How many times the algorithm should shuffle the prizes and re-run", + "title": "Shuffled Prizes" }, "random_terminals": { - "items": { - "type": "integer" - }, - "title": "Random Terminals", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 0, + "description": "How many times to apply the given prizes to random nodes in the interactome", + "title": "Random Terminals" }, "seed": { - "items": { - "type": "integer" - }, - "title": "Seed", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The random seed to use for this run. 
Defaults to the current UNIX timestamp.", + "title": "Seed" }, "w": { - "items": { - "type": "integer" - }, - "title": "W", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "the number of trees", + "title": "W" }, "b": { - "items": { - "type": "number" - }, - "title": "B", - "type": "array" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "the trade-off between including more terminals and using less reliable edges", + "title": "B" }, "d": { - "items": { - "type": "integer" - }, - "title": "D", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "controls the maximum path-length from v0 to terminal nodes", + "title": "D" }, "mu": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Mu", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "controls the degree-based negative prizes (defualt 0.0)", + "title": "Mu" }, "noise": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Noise", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations", + "title": "Noise" }, "g": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "G", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "(Gamma) multiplicative edge penalty from degree of endpoints", + "title": "G" }, "r": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "R", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "msgsteiner parameter that adds random noise to edges, which is rarely needed because the Forest --noisyEdges option is recommended instead (default 0)", + "title": "R" } }, "required": [ - "dummy_mode", - "mu_squared", - "exclude_terms", - "noisy_edges", - "shuffled_prizes", - "random_terminals", - "seed", "w", "b", - "d", - "mu", - "noise", - "g", - "r" + "d" ], "title": "omicsintegrator1RunModel", "type": "object" @@ -794,100 +1019,185 @@ "omicsintegrator2RunModel": { "properties": { "w": { - "items": { - "type": "number" - }, - "title": "W", - "type": "array" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 6, + "description": "Omega: the weight of the edges connecting the dummy node to the nodes selected by dummyMode", + "title": "W" }, "b": { - "items": { - "type": "number" - }, - "title": "B", 
- "type": "array" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 1, + "description": "Beta: scaling factor of prizes", + "title": "B" }, "g": { - "items": { - "type": "number" - }, - "title": "G", - "type": "array" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 20, + "description": "Gamma: multiplicative edge penalty from degree of endpoints", + "title": "G" }, "noise": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Noise", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations.", + "title": "Noise" }, "noisy_edges": { - "items": { - "anyOf": [ - { - "type": "integer" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Noisy Edges", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "An integer specifying how many times to add noise to the given edge values and re-run.", + "title": "Noisy Edges" }, "random_terminals": { - "items": { - "anyOf": [ - { - "type": "integer" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Random Terminals", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "An integer specifying how many times to apply your given prizes to random nodes in the interactome and re-run", + "title": "Random Terminals" }, "dummy_mode": { - "items": { - "anyOf": [ - { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Dummy Mode", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Tells the program which nodes in the interactome to connect the dummy node to. (default: terminals)\n \"terminals\" = connect to all terminals\n \"others\" = connect to all nodes except for terminals\n \"all\" = connect to all nodes in the interactome.", + "title": "Dummy Mode" }, "seed": { - "items": { - "type": "integer" - }, - "title": "Seed", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The random seed to use for this run. 
Defaults to the current UNIX timestamp.", + "title": "Seed" } }, - "required": [ - "w", - "b", - "g", - "noise", - "noisy_edges", - "random_terminals", - "dummy_mode", - "seed" - ], "title": "omicsintegrator2RunModel", "type": "object" }, @@ -921,16 +1231,25 @@ "pathlinkerRunModel": { "properties": { "k": { - "items": { - "type": "integer" - }, - "title": "K", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 100, + "description": "path length", + "title": "K" } }, - "required": [ - "k" - ], "title": "pathlinkerRunModel", "type": "object" }, @@ -964,30 +1283,52 @@ "rwrRunModel": { "properties": { "threshold": { - "items": { - "type": "integer" - }, - "title": "Threshold", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The number of nodes to return", + "title": "Threshold" }, "alpha": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Alpha", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The chance of a restart during the random walk", + "title": "Alpha" } }, "required": [ - "threshold", - "alpha" + "threshold" ], "title": "rwrRunModel", "type": "object" @@ -1022,30 +1363,52 @@ "strwrRunModel": { "properties": { "threshold": { - "items": { - "type": "integer" - }, - "title": "Threshold", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The number of nodes to return", + "title": "Threshold" }, "alpha": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Alpha", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The chance of a restart during the random walk", + "title": "Alpha" } }, "required": [ - "threshold", - "alpha" + "threshold" ], "title": "strwrRunModel", "type": "object" diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index bc7b896fc..bf0f13750 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -3,13 +3,40 @@ parameter combinations. This has been isolated from schema.py as it is not declarative, and rather mainly contains validators and lower-level pydantic code. """ -from typing import Any, cast, Union, Literal +from typing import Annotated, Any, Callable, cast, Union, Literal from spras.runner import algorithms -from pydantic import BaseModel, create_model +from pydantic import BaseModel, BeforeValidator, create_model __all__ = ['AlgorithmUnion'] +def is_numpy_friendly(type: type[Any] | None) -> bool: + """ + Whether the passed in type can have any numpy helpers. + This is mainly used to provide hints in the JSON schema. 
+ """ + return type in (int, float) + +def python_evalish_coerce(type: type[Any] | None) -> Callable[[Any], Any]: + """ + Allows for using numpy and python calls + """ + + def numpy_coerce_validator(value: Any) -> Any: + raise NotImplementedError + + return numpy_coerce_validator + + +def list_coerce(value: Any) -> Any: + """ + Coerces to a value to a list if it isn't already. + Used as a BeforeValidator. + """ + if not isinstance(value, list): + return [value] + return value + def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseModel]: """ Dynamically constructs a parameter-combination model based on the original args model. @@ -26,14 +53,32 @@ def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseMod # class AlgorithmParamsCombination(BaseModel): # key1: list[int] # key2: list[list[str]] - # This function does not worry about getting the cartesian product of this. + # However, we want to preserve certain conveniences (singleton values, fake python evaluation), + # so we also make use of BeforeValidators to do so, and we pass over their preferences into the JSON schema. + # (Note: This function does not worry about getting the cartesian product of this.) - # Map our fields to a list (assuming we have no nested keys) - mapped_list_field: dict[str, type[list[Any]]] = {name: list[field.annotation] for name, field in model.model_fields.items()} + # Map our fields to a list (assuming we have no nested keys), + # and specify our user convenience validators + mapped_list_field: dict[str, Annotated] = { + name: (Annotated[ + list[field.annotation], + # This order isn't arbitrary. + # https://docs.pydantic.dev/latest/concepts/validators/#ordering-of-validators + # This runs second. This coerces any singletons to lists. + BeforeValidator(list_coerce), + # This runs first. This evaluates numpy utils for integer/float lists + BeforeValidator( + python_evalish_coerce(field.annotation), + # json_schema_input_type (sensibly) overwrites, so we only specify it here. + json_schema_input_type=Union[field.annotation, list[field.annotation], str] if is_numpy_friendly(field.annotation) else \ + Union[field.annotation, list[field.annotation]] + ) + ], field) for name, field in model.model_fields.items() + } # Runtime assertion check: mapped_list_field does not contain any `__-prefixed` fields for key in mapped_list_field.keys(): - assert not key.startswith("__"), f"A private key has been passed from {name}'s argument schema." + \ + assert not key.startswith("__"), f"A private key has been passed from {name}'s argument schema. " + \ "This should have been caught by the Snakemake CI step." # Pass this as kwargs to create_model, which usually takes in parameters field_name=type. From 72c4cbd4dbbf9cedf607010f659e5811226ab830 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Tue, 15 Jul 2025 15:56:13 +0000 Subject: [PATCH 50/60] feat: default runs for default algorithms --- config/config.yaml | 1 + config/schema.json | 64 +++++++++++++++++++++++++++++--------- spras/config/algorithms.py | 14 ++++++--- spras/runner.py | 26 +++++++++------- 4 files changed, 75 insertions(+), 30 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 3e2127d53..49ae31f4f 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -95,6 +95,7 @@ algorithms: include: true - name: "domino" + include: true params: include: true run1: diff --git a/config/schema.json b/config/schema.json index 01494a4ea..649b815c6 100644 --- a/config/schema.json +++ b/config/schema.json @@ -315,14 +315,16 @@ "additionalProperties": { "$ref": "#/$defs/allpairsRunModel" }, + "default": { + "default": {} + }, "title": "Runs", "type": "object" } }, "required": [ "name", - "include", - "runs" + "include" ], "title": "allpairsModel", "type": "object" @@ -347,14 +349,16 @@ "additionalProperties": { "$ref": "#/$defs/bowtiebuilderRunModel" }, + "default": { + "default": {} + }, "title": "Runs", "type": "object" } }, "required": [ "name", - "include", - "runs" + "include" ], "title": "bowtiebuilderModel", "type": "object" @@ -379,14 +383,20 @@ "additionalProperties": { "$ref": "#/$defs/dominoRunModel" }, + "default": { + "default": { + "_time": 1752594898.608572, + "module_threshold": null, + "slice_threshold": null + } + }, "title": "Runs", "type": "object" } }, "required": [ "name", - "include", - "runs" + "include" ], "title": "dominoModel", "type": "object" @@ -482,14 +492,20 @@ "additionalProperties": { "$ref": "#/$defs/meoRunModel" }, + "default": { + "default": { + "max_path_length": null, + "local_search": null, + "rand_restarts": null + } + }, "title": "Runs", "type": "object" } }, "required": [ "name", - "include", - "runs" + "include" ], "title": "meoModel", "type": "object" @@ -593,14 +609,19 @@ "additionalProperties": { "$ref": "#/$defs/mincostflowRunModel" }, + "default": { + "default": { + "flow": null, + "capacity": null + } + }, "title": "Runs", "type": "object" } }, "required": [ "name", - "include", - "runs" + "include" ], "title": "mincostflowModel", "type": "object" @@ -1004,14 +1025,25 @@ "additionalProperties": { "$ref": "#/$defs/omicsintegrator2RunModel" }, + "default": { + "default": { + "w": 6.0, + "b": 1.0, + "g": 20.0, + "noise": null, + "noisy_edges": null, + "random_terminals": null, + "dummy_mode": null, + "seed": 1752594898608 + } + }, "title": "Runs", "type": "object" } }, "required": [ "name", - "include", - "runs" + "include" ], "title": "omicsintegrator2Model", "type": "object" @@ -1216,14 +1248,18 @@ "additionalProperties": { "$ref": "#/$defs/pathlinkerRunModel" }, + "default": { + "default": { + "k": 100 + } + }, "title": "Runs", "type": "object" } }, "required": [ "name", - "include", - "runs" + "include" ], "title": "pathlinkerModel", "type": "object" diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index bf0f13750..f129594db 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -3,7 +3,7 @@ parameter combinations. This has been isolated from schema.py as it is not declarative, and rather mainly contains validators and lower-level pydantic code. 
""" -from typing import Annotated, Any, Callable, cast, Union, Literal +from typing import Annotated, Any, Callable, cast, Optional, Union, Literal from spras.runner import algorithms from pydantic import BaseModel, BeforeValidator, create_model @@ -37,7 +37,7 @@ def list_coerce(value: Any) -> Any: return [value] return value -def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseModel]: +def construct_algorithm_model(name: str, model: type[BaseModel], model_default: Optional[BaseModel]) -> type[BaseModel]: """ Dynamically constructs a parameter-combination model based on the original args model. This is the most 'hacky' part of this code, but, thanks to pydantic, we avoid reflection @@ -100,8 +100,14 @@ def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseMod f'{name}Model', name=Literal[name], include=bool, - runs=dict[str, run_model] + # For algorithms that have a default parameter config, we allow arbitrarily running an algorithm + # if no runs are specified. For example, the following config + # name: pathlinker + # include: true + # will run, despite there being no entries in `runs`. + # (create_model entries take in either a type or (type, default)). + runs=dict[str, run_model] if model_default is None else (dict[str, run_model], {"default": model_default}) ) -algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model) for name, (_, model) in algorithms.items()] +algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model, model_default) for name, (_, model, model_default) in algorithms.items()] AlgorithmUnion = Union[tuple(algorithm_models)] diff --git a/spras/runner.py b/spras/runner.py index 843b3cf46..4f603f9b9 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Optional from pydantic import BaseModel @@ -17,17 +17,19 @@ from spras.rwr import RWR, RWRParams from spras.strwr import ST_RWR, ST_RWRParams -algorithms: dict[str, tuple[type[PRM], type[BaseModel]]] = { - "allpairs": (AllPairs, Empty), - "bowtiebuilder": (BowTieBuilder, Empty), - "domino": (DOMINO, DominoParams), - "meo": (MEO, MEOParams), - "mincostflow": (MinCostFlow, MinCostFlowParams), - "omicsintegrator1": (OmicsIntegrator1, OmicsIntegrator1Params), - "omicsintegrator2": (OmicsIntegrator2, OmicsIntegrator2Params), - "pathlinker": (PathLinker, PathLinkerParams), - "rwr": (RWR, RWRParams), - "strwr": (ST_RWR, ST_RWRParams), +# Algorithm names to a three-tuple of (PRM, BaseModel, default BaseModel or None if there are no good defaults). +# This is used for the configuration and to fetch algorithms during reconstruction +algorithms: dict[str, tuple[type[PRM], type[BaseModel], Optional[BaseModel]]] = { + "allpairs": (AllPairs, Empty, Empty()), + "bowtiebuilder": (BowTieBuilder, Empty, Empty()), + "domino": (DOMINO, DominoParams, DominoParams()), + "meo": (MEO, MEOParams, MEOParams()), + "mincostflow": (MinCostFlow, MinCostFlowParams, MinCostFlowParams()), + "omicsintegrator1": (OmicsIntegrator1, OmicsIntegrator1Params, None), + "omicsintegrator2": (OmicsIntegrator2, OmicsIntegrator2Params, OmicsIntegrator2Params()), + "pathlinker": (PathLinker, PathLinkerParams, PathLinkerParams()), + "rwr": (RWR, RWRParams, None), + "strwr": (ST_RWR, ST_RWRParams, None), } def get_algorithm(algorithm: str) -> type[PRM]: From 2ef26727221583cc7ef6c613e651d15a25c6b0e8 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Tue, 15 Jul 2025 17:11:33 +0000 Subject: [PATCH 51/60] feat: function running --- config/schema.json | 4 +-- spras/config/algorithms.py | 55 ++++++++++++++++++++++++++++++-------- spras/omicsintegrator1.py | 3 +-- util/play.py | 5 ++++ 4 files changed, 52 insertions(+), 15 deletions(-) create mode 100644 util/play.py diff --git a/config/schema.json b/config/schema.json index 649b815c6..be41b5b3d 100644 --- a/config/schema.json +++ b/config/schema.json @@ -385,7 +385,7 @@ }, "default": { "default": { - "_time": 1752594898.608572, + "_time": 1752596079.9888437, "module_threshold": null, "slice_threshold": null } @@ -1034,7 +1034,7 @@ "noisy_edges": null, "random_terminals": null, "dummy_mode": null, - "seed": 1752594898608 + "seed": 1752596079988 } }, "title": "Runs", diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index f129594db..b5c42199d 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -3,8 +3,10 @@ parameter combinations. This has been isolated from schema.py as it is not declarative, and rather mainly contains validators and lower-level pydantic code. """ +import ast from typing import Annotated, Any, Callable, cast, Optional, Union, Literal +import numpy as np from spras.runner import algorithms from pydantic import BaseModel, BeforeValidator, create_model @@ -17,16 +19,48 @@ def is_numpy_friendly(type: type[Any] | None) -> bool: """ return type in (int, float) -def python_evalish_coerce(type: type[Any] | None) -> Callable[[Any], Any]: +def python_evalish_coerce(value: Any) -> Any: """ - Allows for using numpy and python calls + Allows for using numpy and python calls. + + **Safety Note**: This does not prevent availability attacks: this can still exhaust + resources if wanted. This only prevents secret leakage. """ + + if not isinstance(value, str): + return value + + # These strings are in the form of function calls `function.name(param1, param2, ...)`. + # Since we want to avoid `eval` (since this might be running in the secret-sensitive HTCondor), + # we need to parse these functions. + functions_dict: dict[str, Callable[[list[Any]], list[Union[int, float]]]] = { + 'range': lambda params: list(range(*params)), + "np.linspace": lambda params: list(np.linspace(*params)), + "np.arange": lambda params: list(np.arange(*params)), + "np.logspace": lambda params: list(np.logspace(*params)), + } + + # To do this, we get the AST of our string as an expression + value_ast = ast.parse(value, mode='eval') + + # Then we do some light parsing - we're only looking to do some literal evaluation + # (e.g. allowing 1+1) and some basic function parsing. Full python programs + # should just generate a config.yaml. + + # This should always be an Expression whose body is Call (a function). + if not isinstance(value_ast.body, ast.Call): + raise ValueError(f'The python code "{value}" should be calling a function directly. Is this meant to be python code?') - def numpy_coerce_validator(value: Any) -> Any: - raise NotImplementedError + # We get the function name back as a string + function_name = ast.unparse(value_ast.body.func) - return numpy_coerce_validator + # and we use the (non-availability) safe `ast.literal_eval` to support light expressions. 
+ arguments = [ast.literal_eval(arg) for arg in value_ast.body.args] if function_name not in functions_dict: + raise ValueError(f"{function_name} is not an allowed function to be run!") + + return functions_dict[function_name](arguments) def list_coerce(value: Any) -> Any: """ Coerces a value to a list if it isn't one already. @@ -65,14 +99,13 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: # This order isn't arbitrary. # https://docs.pydantic.dev/latest/concepts/validators/#ordering-of-validators # This runs second. This coerces any singletons to lists. - BeforeValidator(list_coerce), + BeforeValidator(list_coerce, json_schema_input_type=Union[field.annotation, list[field.annotation]]), # This runs first. This evaluates numpy utils for integer/float lists BeforeValidator( - python_evalish_coerce(field.annotation), - # json_schema_input_type (sensibly) overwrites, so we only specify it here. - json_schema_input_type=Union[field.annotation, list[field.annotation], str] if is_numpy_friendly(field.annotation) else \ - Union[field.annotation, list[field.annotation]] - ) + python_evalish_coerce, + # json_schema_input_type (sensibly) overwrites, so we have to specify the entire union again here. + json_schema_input_type=Union[field.annotation, list[field.annotation], str] + ) if is_numpy_friendly(field.annotation) else None ], field) for name, field in model.model_fields.items() } diff --git a/spras/omicsintegrator1.py index ddb934bb5..013eced8d 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -209,8 +209,7 @@ def run(inputs, output_file, args, container_framework="docker"): command.extend(['--noisyEdges', str(args.noisy_edges)]) command.extend(['--shuffledPrizes', str(args.shuffled_prizes)]) command.extend(['--randomTerminals', str(args.random_terminals)]) - if args.seed is not None: - command.extend(['--seed', str(args.seed)]) + command.extend(['--seed', str(args.seed)]) container_suffix = "omics-integrator-1:no-conda" # no-conda version is the default run_container_and_log('Omics Integrator 1', diff --git a/util/play.py new file mode 100644 index 000000000..f53ae9f53 --- /dev/null +++ b/util/play.py @@ -0,0 +1,5 @@ +import ast +value_ast = ast.parse("np.range.test(1, 2, 3)", mode='eval') +# print(ast.dump(value_ast.body, indent=2)) +assert isinstance(value_ast.body, ast.Call) +print([ast.literal_eval(arg) for arg in value_ast.body.args]) \ No newline at end of file From 9442b6496251823456670092f12b404dba19c76c Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 15 Jul 2025 17:13:45 +0000 Subject: [PATCH 52/60] chore: drop play --- util/play.py | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 util/play.py diff --git a/util/play.py deleted file mode 100644 index f53ae9f53..000000000 --- a/util/play.py +++ /dev/null @@ -1,5 +0,0 @@ -import ast -value_ast = ast.parse("np.range.test(1, 2, 3)", mode='eval') -# print(ast.dump(value_ast.body, indent=2)) -assert isinstance(value_ast.body, ast.Call) -print([ast.literal_eval(arg) for arg in value_ast.body.args]) \ No newline at end of file
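
A usage sketch of the eval-free expansion implemented above (illustrative; condensed from the patch, with the whitelist trimmed to one entry):

    import ast

    allowed = {"range": lambda args: list(range(*args))}  # trimmed whitelist

    def expand(expr: str):
        tree = ast.parse(expr, mode="eval")
        if not isinstance(tree.body, ast.Call):
            raise ValueError(f"{expr!r} must be a direct function call")
        name = ast.unparse(tree.body.func)  # e.g. "range" or "np.linspace"
        args = [ast.literal_eval(arg) for arg in tree.body.args]
        return allowed[name](args)

    print(expand("range(1, 3)"))  # [1, 2]

From 60b562f45a89a61d863a4bc6422cf87ce9fe81b7 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R."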
Date: Tue, 15 Jul 2025 17:21:36 +0000 Subject: [PATCH 53/60] fix(config): don't try to parse in config.py --- spras/config/config.py | 42 +++++++----------------------------------- 1 file changed, 7 insertions(+), 35 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 252d6ccf5..72f08f330 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -164,47 +164,19 @@ def process_algorithms(self, raw_config: RawConfig): # Do not parse the rest of the parameters for this algorithm if it is not included continue - if cur_params.directed is not None: - warnings.warn("UPDATE: we no longer use the directed key in the config file", stacklevel=2) - - cur_params = cur_params.__pydantic_extra__ - if cur_params is None: - raise RuntimeError("An internal error occured: ConfigDict extra should be set on AlgorithmParams.") - - # The algorithm has no named arguments so create a default placeholder - if len(cur_params.keys()) == 0: - cur_params["run1"] = {"spras_placeholder": ["no parameters"]} + runs: dict[str, Any] = cur_params.runs # Each set of runs should be 1 level down in the config file - for run_params in cur_params: + for run_name in runs.keys(): all_runs = [] # We create the product of all param combinations for each run param_name_list = [] - if cur_params[run_params]: - for p in cur_params[run_params]: - param_name_list.append(p) - obj = str(cur_params[run_params][p]) - try: - obj = [int(obj)] - except ValueError: - try: - obj = [float(obj)] - except ValueError: - # Handles arrays and special evaluation types - # TODO: do we want to explicitly bar `eval` if we may use untrusted user inputs later? - if obj.startswith(("range", "np.linspace", "np.arange", "np.logspace", "[")): - obj = eval(obj) - elif obj.lower() == "true": - obj = [True] - elif obj.lower() == "false": - obj = [False] - else: - # Catch-all for strings - obj = [obj] - if not isinstance(obj, Iterable): - raise ValueError(f"The object `{obj}` in algorithm {alg.name} at key '{p}' in run '{run_params}' is not iterable!") from None - all_runs.append(obj) + for param in runs[run_name]: + param_name_list.append(param) + # this is guaranteed to be list[Any] by algorithms.py + param_values: list[Any] = runs[run_name][param] + all_runs.append(param_values) run_list_tuples = list(it.product(*all_runs)) param_name_tuple = tuple(param_name_list) for r in run_list_tuples: From c1947e67409b90cd07a2f632335302f8f6422554 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 15 Jul 2025 18:32:30 +0000 Subject: [PATCH 54/60] fix: subscriptability --- spras/config/algorithms.py | 17 ++++-- spras/config/config.py | 10 ++-- spras/config/schema.py | 3 - spras/omicsintegrator2.py | 8 ++- test/test_config.py | 109 ++++++++++++++++--------------------- 5 files changed, 70 insertions(+), 77 deletions(-) diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index b5c42199d..fbc7a2230 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -4,11 +4,11 @@ and rather mainly contains validators and lower-level pydantic code. 
""" import ast -from typing import Annotated, Any, Callable, cast, Optional, Union, Literal +from typing import Annotated, Any, Callable, cast, get_args, Optional, Union, Literal import numpy as np from spras.runner import algorithms -from pydantic import BaseModel, BeforeValidator, create_model +from pydantic import BaseModel, BeforeValidator, create_model, Field __all__ = ['AlgorithmUnion'] @@ -17,7 +17,11 @@ def is_numpy_friendly(type: type[Any] | None) -> bool: Whether the passed in type can have any numpy helpers. This is mainly used to provide hints in the JSON schema. """ - return type in (int, float) + allowed_types = (int, float) + + # check basic types, then check optional types + return type in allowed_types or \ + any([arg for arg in get_args(type) if arg in allowed_types]) def python_evalish_coerce(value: Any) -> Any: """ @@ -41,10 +45,10 @@ def python_evalish_coerce(value: Any) -> Any: } # To do this, we get the AST of our string as an expression - value_ast = ast.parse(value, mode='eval') + value_ast = ast.parse(value, mode='eval', filename='config.yaml') # Then we do some light parsing - we're only looking to do some literal evaluation - # (e.g. allowing 1+1) and some basic function parsing. Full python programs + # (allowing light python notation) and some basic function parsing. Full python programs # should just generate a config.yaml. # This should always be an Expression whose body is Call (a function). @@ -143,4 +147,5 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: ) algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model, model_default) for name, (_, model, model_default) in algorithms.items()] -AlgorithmUnion = Union[tuple(algorithm_models)] +# name differentriates algorithms +AlgorithmUnion = Annotated[Union[tuple(algorithm_models)], Field(discriminator='name')] diff --git a/spras/config/config.py b/spras/config/config.py index 72f08f330..6eeb760a7 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -156,15 +156,14 @@ def process_algorithms(self, raw_config: RawConfig): self.algorithm_directed = dict() self.algorithms = raw_config.algorithms for alg in self.algorithms: - cur_params = alg.params - if cur_params.include: + if alg.include: # This dict maps from parameter combinations hashes to parameter combination dictionaries self.algorithm_params[alg.name] = dict() else: # Do not parse the rest of the parameters for this algorithm if it is not included continue - runs: dict[str, Any] = cur_params.runs + runs: dict[str, Any] = alg.runs # Each set of runs should be 1 level down in the config file for run_name in runs.keys(): @@ -172,10 +171,11 @@ def process_algorithms(self, raw_config: RawConfig): # We create the product of all param combinations for each run param_name_list = [] - for param in runs[run_name]: + run_subscriptable = vars(runs[run_name]) + for param in run_subscriptable: param_name_list.append(param) # this is guaranteed to be list[Any] by algorithms.py - param_values: list[Any] = runs[run_name][param] + param_values: list[Any] = run_subscriptable[param] all_runs.append(param_values) run_list_tuples = list(it.product(*all_runs)) param_name_tuple = tuple(param_name_list) diff --git a/spras/config/schema.py b/spras/config/schema.py index 7657a41a0..fc502b677 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -131,6 +131,3 @@ class RawConfig(BaseModel): reconstruction_settings: ReconstructionSettings model_config = ConfigDict(extra='forbid') - -# AlgorithmUnion 
is dynamically constructed. -RawConfig.model_rebuild() diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 5e8e73ef0..d92ba77d2 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -5,6 +5,7 @@ import pandas as pd from pydantic import BaseModel, ConfigDict, Field +from spras.config.util import CaseInsensitiveEnum from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import reinsert_direction_col_undirected @@ -13,6 +14,11 @@ __all__ = ['OmicsIntegrator2', 'OmicsIntegrator2Params'] +class DummyMode(CaseInsensitiveEnum): + terminals = 'terminals' + others = 'others' + all = 'all' + class OmicsIntegrator2Params(BaseModel): w: float = 6 "Omega: the weight of the edges connecting the dummy node to the nodes selected by dummyMode" @@ -32,7 +38,7 @@ class OmicsIntegrator2Params(BaseModel): random_terminals: Optional[int] = None "An integer specifying how many times to apply your given prizes to random nodes in the interactome and re-run" - dummy_mode: Optional[str] = None + dummy_mode: Optional[DummyMode] = None """ Tells the program which nodes in the interactome to connect the dummy node to. (default: terminals) "terminals" = connect to all terminals diff --git a/test/test_config.py b/test/test_config.py index 6095ad145..b0031d029 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -5,6 +5,7 @@ import spras.config.config as config from spras.config.schema import DEFAULT_HASH_LENGTH +from spras.config.container_schema import DEFAULT_CONTAINER_PREFIX filler_dataset_data: dict[str, str | list[str]] = { "data_dir": "fake", @@ -18,10 +19,12 @@ # individual values of the dict can be changed and the whole initialization can be re-run. def get_test_config(): test_raw_config = { - "container_framework": "singularity", - "container_registry": { - "base_url": "docker.io", - "owner": "reedcompbio", + "containers": { + "framework": "singularity", + "registry": { + "base_url": "docker.io", + "owner": "reedcompbio", + }, }, "hash_length": 7, "reconstruction_settings": { @@ -49,55 +52,37 @@ def get_test_config(): "data_dir": "gs-fake" }], "algorithms": [ + # Since there is algorithm validation, + # we are (mostly) forced to use real algorithm parameters here. + # To make this more readable, we make the 'test names' the run names. + # TODO: we don't have a test for combinations of strings anymore. This seems to be fine, + # but it would be nice to have once we introduce an algorithm that takes more than 1 string parameter. 
{ - "name": "strings", - "params": { - "include": True, - "run1": {"test": "str1", "test2": ["str2", "str3"]} - } - }, - { - "name": "numbersAndBools", - "params": { - "include": True, - "run1": {"a": 1, "b": [float(2.0), 3], "c": [4], "d": float(5.6), "f": False} - } - }, - { - "name": "singleton_int64_with_array", - "params": { - "include": True, - "run1": {"test": np.int64(1), "test2": [2, 3]} + "name": "omicsintegrator2", + "include": True, + "runs": { + "strings": {"dummyMode": ["terminals", "others"], "b": 1}, + # spacing in np.linspace is on purpose + "singleton_string_np_linspace": {"dummyMode": "terminals", "b": "np.linspace(0, 5,2)"}, + "str_array_np_logspace": {"test": ["others", "all"], "g": "np.logspace(1,1)"} } }, { - "name": "singleton_string_np_linspace", - "params": { - "include": True, - "run1": {"test": "str1", "test2": "np.linspace(0,5,2)"} + "name": "meo", + "include": True, + "runs": { + "numbersAndBool": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": True}, + "numbersAndBools": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": [True, False]}, + "boolArrTest": {"local_search": [True, False], "max_path_length": "range(1, 3)"} } }, { - "name": "str_array_np_logspace", - "params": { - "include": True, - "run1": {"test": ["a", "b"], "test2": "np.logspace(1,1)"} + "name": "mincostflow", + "include": True, + "runs": { + "int64artifact": {"flow": "np.arange(5,6)", "capacity": [2, 3]} } }, - { - "name": "int64artifact", - "params": { - "include": True, - "run1": {"test": "np.arange(5,6)", "test2": [2, 3]} - } - }, - { - "name": "boolArrTest", - "params": { - "include": True, - "run1": {"flags": [True, False], "range": "range(1, 3)"} - } - } ], "analysis": { "summary": { @@ -159,46 +144,46 @@ def test_config_container_framework_normalization(self): # Test singularity test_config = get_test_config() - test_config["container_framework"] = "singularity" + test_config["containers"]["framework"] = "singularity" config.init_global(test_config) - assert (config.config.container_framework == "singularity") + assert (config.config.container_settings.framework == "singularity") # Test singularity with capitalization - test_config["container_framework"] = "Singularity" + test_config["containers"]["framework"] = "Singularity" config.init_global(test_config) - assert (config.config.container_framework == "singularity") + assert (config.config.container_settings.framework == "singularity") # Test docker - test_config["container_framework"] = "docker" + test_config["containers"]["framework"] = "docker" config.init_global(test_config) - assert (config.config.container_framework == "docker") + assert (config.config.container_settings.framework == "docker") # Test docker with capitalization - test_config["container_framework"] = "Docker" + test_config["containers"]["framework"] = "Docker" config.init_global(test_config) - assert (config.config.container_framework == "docker") + assert (config.config.container_settings.framework == "docker") # Test unknown framework - test_config["container_framework"] = "badFramework" + test_config["containers"]["framework"] = "badFramework" with pytest.raises(ValueError): config.init_global(test_config) def test_config_container_registry(self): test_config = get_test_config() - test_config["container_registry"]["base_url"] = "docker.io" - test_config["container_registry"]["owner"] = "reedcompbio" + test_config["containers"]["registry"]["base_url"] = "docker.io" + test_config["containers"]["registry"]["owner"] = 
"reedcompbio" config.init_global(test_config) - assert (config.config.container_prefix == "docker.io/reedcompbio") + assert (config.config.container_settings.prefix == "docker.io/reedcompbio") - test_config["container_registry"]["base_url"] = "another.repo" - test_config["container_registry"]["owner"] = "different-owner" + test_config["containers"]["registry"]["base_url"] = "another.repo" + test_config["containers"]["registry"]["owner"] = "different-owner" config.init_global(test_config) - assert (config.config.container_prefix == "another.repo/different-owner") + assert (config.config.container_settings.prefix == "another.repo/different-owner") - test_config["container_registry"]["base_url"] = "" - test_config["container_registry"]["owner"] = "" + test_config["containers"]["registry"]["base_url"] = "" + test_config["containers"]["registry"]["owner"] = "" config.init_global(test_config) - assert (config.config.container_prefix == config.DEFAULT_CONTAINER_PREFIX) + assert (config.config.container_settings.prefix == DEFAULT_CONTAINER_PREFIX) def test_error_dataset_label(self): test_config = get_test_config() From 8beaf72a7e7f64c42cc543a0f31b77fdf99485e3 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 15 Jul 2025 20:03:48 +0000 Subject: [PATCH 55/60] fix: auto-discriminator mapping & forbid --- config/egfr.yaml | 71 ++++++++------------ config/schema.json | 107 ++++++++++++++++++++++++++++--- spras/config/algorithms.py | 6 +- spras/config/container_schema.py | 11 +++- spras/domino.py | 2 +- spras/meo.py | 2 +- spras/mincostflow.py | 2 +- spras/omicsintegrator1.py | 2 +- spras/omicsintegrator2.py | 2 +- spras/pathlinker.py | 2 +- spras/rwr.py | 2 +- spras/strwr.py | 2 +- 12 files changed, 143 insertions(+), 68 deletions(-) diff --git a/config/egfr.yaml b/config/egfr.yaml index 9b4ccc45b..106963c62 100644 --- a/config/egfr.yaml +++ b/config/egfr.yaml @@ -1,41 +1,25 @@ -# The length of the hash used to identify a parameter combination -hash_length: 7 - -# Specify the container framework used by each PRM wrapper. Valid options include: -# - docker (default if not specified) -# - singularity -- Also known as apptainer, useful in HPC/HTC environments where docker isn't allowed -# - dsub -- experimental with limited support, used for running on Google Cloud -container_framework: docker +# yaml-language-server: $schema=./schema.json -# Only used if container_framework is set to singularity, this will unpack the singularity containers -# to the local filesystem. This is useful when PRM containers need to run inside another container, -# such as would be the case in an HTCondor/OSPool environment. -# NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way -# that persists after the workflow is complete. To clean up the unpacked containers, the user must -# manually delete them. 
-unpack_singularity: false - -# Allow the user to configure which container registry containers should be pulled from -# Note that this assumes container names are consistent across registries, and that the -# registry being passed doesn't require authentication for pull actions -container_registry: - base_url: docker.io - # The owner or project of the registry - # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs - owner: reedcompbio +hash_length: 7 +containers: + framework: docker + unpack_singularity: false + registry: + base_url: docker.io + owner: reedcompbio algorithms: - name: pathlinker - params: - include: true + include: true + runs: run1: k: - 10 - 20 - 70 - name: omicsintegrator1 - params: - include: true + include: true + runs: run1: b: - 0.55 @@ -53,8 +37,8 @@ algorithms: - 0.008 dummy_mode: ["file"] - name: omicsintegrator2 - params: - include: true + include: true + runs: run1: b: - 4 @@ -66,36 +50,31 @@ algorithms: g: - 3 - name: meo - params: - include: true + include: true + runs: run1: - local_search: - - "Yes" + local_search: true max_path_length: - 3 rand_restarts: - 10 run2: - local_search: - - "No" - max_path_length: - - 2 - rand_restarts: - - 10 + local_search: false + max_path_length: 2 + rand_restarts: 10 - name: allpairs - params: - include: true + include: true - name: domino - params: - include: true + include: true + runs: run1: slice_threshold: - 0.3 module_threshold: - 0.05 - name: mincostflow - params: - include: true + include: true + runs: run1: capacity: - 15 diff --git a/config/schema.json b/config/schema.json index be41b5b3d..c15dcaa8a 100644 --- a/config/schema.json +++ b/config/schema.json @@ -50,23 +50,23 @@ "additionalProperties": false, "properties": { "base_url": { + "default": "docker.io", + "description": "The domain of the registry", "title": "Base Url", "type": "string" }, "owner": { + "default": "reedcompbio", "description": "The owner or project of the registry", "title": "Owner", "type": "string" } }, - "required": [ - "base_url", - "owner" - ], "title": "ContainerRegistry", "type": "object" }, "ContainerSettings": { + "additionalProperties": false, "properties": { "framework": { "$ref": "#/$defs/ContainerFramework", @@ -149,6 +149,15 @@ "title": "Dataset", "type": "object" }, + "DummyMode": { + "enum": [ + "terminals", + "others", + "all" + ], + "title": "DummyMode", + "type": "string" + }, "EvaluationAnalysis": { "additionalProperties": false, "properties": { @@ -301,6 +310,7 @@ "type": "object" }, "allpairsModel": { + "additionalProperties": false, "properties": { "name": { "const": "allpairs", @@ -330,11 +340,13 @@ "type": "object" }, "allpairsRunModel": { + "additionalProperties": false, "properties": {}, "title": "allpairsRunModel", "type": "object" }, "bowtiebuilderModel": { + "additionalProperties": false, "properties": { "name": { "const": "bowtiebuilder", @@ -364,11 +376,13 @@ "type": "object" }, "bowtiebuilderRunModel": { + "additionalProperties": false, "properties": {}, "title": "bowtiebuilderRunModel", "type": "object" }, "dominoModel": { + "additionalProperties": false, "properties": { "name": { "const": "domino", @@ -385,7 +399,7 @@ }, "default": { "default": { - "_time": 1752596079.9888437, + "_time": 1752606304.38952, "module_threshold": null, "slice_threshold": null } @@ -402,6 +416,7 @@ "type": "object" }, "dominoRunModel": { + "additionalProperties": false, "properties": { "_time": { "anyOf": [ @@ -439,6 +454,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } 
@@ -465,6 +483,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -478,6 +499,7 @@ "type": "object" }, "meoModel": { + "additionalProperties": false, "properties": { "name": { "const": "meo", @@ -511,6 +533,7 @@ "type": "object" }, "meoRunModel": { + "additionalProperties": false, "properties": { "max_path_length": { "anyOf": [ @@ -530,6 +553,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -582,6 +608,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -595,6 +624,7 @@ "type": "object" }, "mincostflowModel": { + "additionalProperties": false, "properties": { "name": { "const": "mincostflow", @@ -627,6 +657,7 @@ "type": "object" }, "mincostflowRunModel": { + "additionalProperties": false, "properties": { "flow": { "anyOf": [ @@ -646,6 +677,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -672,6 +706,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -685,6 +722,7 @@ "type": "object" }, "omicsintegrator1Model": { + "additionalProperties": false, "properties": { "name": { "const": "omicsintegrator1", @@ -712,6 +750,7 @@ "type": "object" }, "omicsintegrator1RunModel": { + "additionalProperties": false, "properties": { "dummy_mode": { "anyOf": [ @@ -915,6 +954,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -941,6 +983,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -967,6 +1012,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -993,6 +1041,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -1011,6 +1062,7 @@ "type": "object" }, "omicsintegrator2Model": { + "additionalProperties": false, "properties": { "name": { "const": "omicsintegrator2", @@ -1034,7 +1086,7 @@ "noisy_edges": null, "random_terminals": null, "dummy_mode": null, - "seed": 1752596079988 + "seed": 1752606304389 } }, "title": "Runs", @@ -1049,6 +1101,7 @@ "type": "object" }, "omicsintegrator2RunModel": { + "additionalProperties": false, "properties": { "w": { "anyOf": [ @@ -1125,6 +1178,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -1151,6 +1207,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -1177,6 +1236,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -1188,13 +1250,13 @@ "dummy_mode": { "anyOf": [ { - "type": "string" + "$ref": "#/$defs/DummyMode" }, { "items": { "anyOf": [ { - "type": "string" + "$ref": "#/$defs/DummyMode" }, { "type": "null" @@ -1234,6 +1296,7 @@ "type": "object" }, "pathlinkerModel": { + "additionalProperties": false, "properties": { "name": { "const": "pathlinker", @@ -1265,6 +1328,7 @@ "type": "object" }, "pathlinkerRunModel": { + "additionalProperties": false, "properties": { "k": { "anyOf": [ @@ -1290,6 +1354,7 @@ "type": "object" }, "rwrModel": { + "additionalProperties": false, "properties": { "name": { "const": "rwr", @@ -1317,6 +1382,7 @@ "type": "object" }, "rwrRunModel": { + "additionalProperties": false, "properties": { "threshold": { "anyOf": [ @@ -1354,6 +1420,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -1370,6 +1439,7 @@ "type": "object" }, "strwrModel": { + "additionalProperties": false, "properties": { "name": { "const": "strwr", @@ -1397,6 +1467,7 @@ "type": "object" }, "strwrRunModel": { + "additionalProperties": false, "properties": { "threshold": { "anyOf": [ @@ -1434,6 +1505,9 @@ }, "type": "array" }, + { + "type": "string" + 
}, { "type": "null" } @@ -1463,7 +1537,22 @@ }, "algorithms": { "items": { - "anyOf": [ + "discriminator": { + "mapping": { + "allpairs": "#/$defs/allpairsModel", + "bowtiebuilder": "#/$defs/bowtiebuilderModel", + "domino": "#/$defs/dominoModel", + "meo": "#/$defs/meoModel", + "mincostflow": "#/$defs/mincostflowModel", + "omicsintegrator1": "#/$defs/omicsintegrator1Model", + "omicsintegrator2": "#/$defs/omicsintegrator2Model", + "pathlinker": "#/$defs/pathlinkerModel", + "rwr": "#/$defs/rwrModel", + "strwr": "#/$defs/strwrModel" + }, + "propertyName": "name" + }, + "oneOf": [ { "$ref": "#/$defs/allpairsModel" }, diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index fbc7a2230..32f6b82d3 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -8,7 +8,7 @@ import numpy as np from spras.runner import algorithms -from pydantic import BaseModel, BeforeValidator, create_model, Field +from pydantic import BaseModel, BeforeValidator, create_model, ConfigDict, Field __all__ = ['AlgorithmUnion'] @@ -123,6 +123,7 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: # have had a key that starts with __ in mapped_list_fields. The above assertion prevents this. run_model = (cast(Any, create_model))( f'{name}RunModel', + __config__=ConfigDict(extra='forbid'), **mapped_list_field ) @@ -143,7 +144,8 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: # include: true # will run, despite there being no entries in `runs`. # (create_model entries take in either a type or (type, default)). - runs=dict[str, run_model] if model_default is None else (dict[str, run_model], {"default": model_default}) + runs=dict[str, run_model] if model_default is None else (dict[str, run_model], {"default": model_default}), + __config__=ConfigDict(extra='forbid') ) algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model, model_default) for name, (_, model, model_default) in algorithms.items()] diff --git a/spras/config/container_schema.py b/spras/config/container_schema.py index 9688a9b51..ea9881a30 100644 --- a/spras/config/container_schema.py +++ b/spras/config/container_schema.py @@ -22,10 +22,13 @@ class ContainerFramework(CaseInsensitiveEnum): dsub = 'dsub' class ContainerRegistry(BaseModel): - base_url: str - owner: str = Field(description="The owner or project of the registry") + base_url: str = "docker.io" + "The domain of the registry" - model_config = ConfigDict(extra='forbid') + owner: str = "reedcompbio" + "The owner or project of the registry" + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) class ContainerSettings(BaseModel): framework: ContainerFramework = ContainerFramework.docker @@ -33,6 +36,8 @@ class ContainerSettings(BaseModel): registry: ContainerRegistry hash_length: int = 7 + model_config = ConfigDict(extra='forbid') + @dataclass class ProcessedContainerSettings: framework: ContainerFramework = ContainerFramework.docker diff --git a/spras/domino.py b/spras/domino.py index a9ce7a43b..521f89722 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -26,7 +26,7 @@ class DominoParams(NondeterministicModel): slice_threshold: Optional[float] = None "the p-value threshold for considering a putative module as final module (optional)" - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ DOMINO will construct a fully undirected graph from the provided input file diff --git 
a/spras/meo.py b/spras/meo.py index 02edf07af..4b3f9299e 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -83,7 +83,7 @@ class MEOParams(BaseModel): rand_restarts: Optional[int] = None "The number of random restarts to do." - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ MEO can support partially directed graphs diff --git a/spras/mincostflow.py b/spras/mincostflow.py index 2673d91e2..1f7ff0cf7 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -20,7 +20,7 @@ class MinCostFlowParams(BaseModel): capacity: Optional[float] = None "amount of capacity allowed on each edge" - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ MinCostFlow deals with fully directed graphs diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 013eced8d..1f33c25f7 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -77,7 +77,7 @@ class OmicsIntegrator1Params(BaseModel): r: Optional[float] = None "msgsteiner parameter that adds random noise to edges, which is rarely needed because the Forest --noisyEdges option is recommended instead (default 0)" - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) class OmicsIntegrator1(PRM[OmicsIntegrator1Params]): """ diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index d92ba77d2..aef4f3c48 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -49,7 +49,7 @@ class OmicsIntegrator2Params(BaseModel): seed: int = Field(default_factory=lambda _: int(time.time() * 1000)) "The random seed to use for this run. Defaults to the current UNIX timestamp." - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ Omics Integrator 2 will construct a fully undirected graph from the provided input file diff --git a/spras/pathlinker.py b/spras/pathlinker.py index 9b6fe964c..da0a91ba2 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -19,7 +19,7 @@ class PathLinkerParams(BaseModel): k: int = 100 "path length" - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ Pathlinker will construct a fully directed graph from the provided input file diff --git a/spras/rwr.py b/spras/rwr.py index ba78589ec..dff5bdb97 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -19,7 +19,7 @@ class RWRParams(BaseModel): alpha: Optional[float] = None "The chance of a restart during the random walk" - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) class RWR(PRM[RWRParams]): required_inputs = ['network','nodes'] diff --git a/spras/strwr.py b/spras/strwr.py index 37590e7c6..1b9159eff 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -19,7 +19,7 @@ class ST_RWRParams(BaseModel): alpha: Optional[float] = None "The chance of a restart during the random walk" - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) # Note: This class is almost identical to the rwr.py file. class ST_RWR(PRM[ST_RWRParams]): From b07a7ef0f1eba21609f0eb87bffc603a4199723c Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
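
Illustrative only: one plausible shape for the CaseInsensitiveEnum that DummyMode extends above (the real implementation lives in spras/config/util.py and may differ):

    from enum import Enum

    class CaseInsensitiveEnum(str, Enum):
        @classmethod
        def _missing_(cls, value):
            # Fall back to a lowercase lookup so "Terminals" matches "terminals".
            if isinstance(value, str):
                return cls.__members__.get(value.lower())
            return None

    class DummyMode(CaseInsensitiveEnum):
        terminals = "terminals"
        others = "others"
        all = "all"

    assert DummyMode("Terminals") is DummyMode.terminals
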
Date: Tue, 15 Jul 2025 20:04:32 +0000 Subject: [PATCH 56/60] style: fmt --- spras/config/algorithms.py | 13 +++++++------ spras/config/container_schema.py | 7 ++++--- spras/config/schema.py | 1 + spras/config/util.py | 2 +- spras/domino.py | 2 +- spras/omicsintegrator1.py | 2 +- spras/runner.py | 2 +- test/test_config.py | 2 +- 8 files changed, 17 insertions(+), 14 deletions(-) diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index 32f6b82d3..8c49c2ae2 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -4,11 +4,12 @@ and rather mainly contains validators and lower-level pydantic code. """ import ast -from typing import Annotated, Any, Callable, cast, get_args, Optional, Union, Literal +from typing import Annotated, Any, Callable, Literal, Optional, Union, cast, get_args import numpy as np +from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, create_model + from spras.runner import algorithms -from pydantic import BaseModel, BeforeValidator, create_model, ConfigDict, Field __all__ = ['AlgorithmUnion'] @@ -33,7 +34,7 @@ def python_evalish_coerce(value: Any) -> Any: if not isinstance(value, str): return value - + # These strings are in the form of function calls `function.name(param1, param2, ...)`. # Since we want to avoid `eval` (since this might be running in the secret-sensitive HTCondor), # we need to parse these functions. @@ -54,7 +55,7 @@ def python_evalish_coerce(value: Any) -> Any: # This should always be an Expression whose body is Call (a function). if not isinstance(value_ast.body, ast.Call): raise ValueError(f'The python code "{value}" should be calling a function directly. Is this meant to be python code?') - + # We get the function name back as a string function_name = ast.unparse(value_ast.body.func) @@ -63,7 +64,7 @@ def python_evalish_coerce(value: Any) -> Any: if function_name not in functions_dict: raise ValueError(f"{function_name} is not an allowed function to be run!") - + return functions_dict[function_name](arguments) def list_coerce(value: Any) -> Any: @@ -126,7 +127,7 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: __config__=ConfigDict(extra='forbid'), **mapped_list_field ) - + # Here is an example of how this would look like inside config.yaml # name: pathlinker # include: true diff --git a/spras/config/container_schema.py b/spras/config/container_schema.py index ea9881a30..c88692678 100644 --- a/spras/config/container_schema.py +++ b/spras/config/container_schema.py @@ -6,10 +6,11 @@ this subsection of the configuration. 
""" +import warnings from dataclasses import dataclass -from pydantic import BaseModel, ConfigDict, Field from typing import Optional -import warnings + +from pydantic import BaseModel, ConfigDict, Field from spras.config.util import CaseInsensitiveEnum @@ -60,7 +61,7 @@ def from_container_settings(settings: ContainerSettings, default_hash_length: in container_prefix = DEFAULT_CONTAINER_PREFIX if settings.registry and settings.registry.base_url != "" and settings.registry.owner != "": container_prefix = settings.registry.base_url + "/" + settings.registry.owner - + return ProcessedContainerSettings( framework=container_framework, unpack_singularity=unpack_singularity, diff --git a/spras/config/schema.py b/spras/config/schema.py index fc502b677..b2ff0b3bd 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -19,6 +19,7 @@ from spras.config.container_schema import ContainerSettings from spras.config.util import CaseInsensitiveEnum + class SummaryAnalysis(BaseModel): include: bool diff --git a/spras/config/util.py b/spras/config/util.py index 0ed99a26e..63799e478 100644 --- a/spras/config/util.py +++ b/spras/config/util.py @@ -4,8 +4,8 @@ only import this config file. """ -from enum import Enum import time +from enum import Enum from typing import Any from pydantic import BaseModel, ConfigDict, Field diff --git a/spras/domino.py b/spras/domino.py index 521f89722..a45a445a2 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -5,8 +5,8 @@ import pandas as pd from pydantic import ConfigDict -from spras.containers import prepare_volume, run_container_and_log from spras.config.util import NondeterministicModel +from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( add_constant, reinsert_direction_col_undirected, diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 1f33c25f7..d9ee603fb 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -1,5 +1,5 @@ -from pathlib import Path import time +from pathlib import Path from typing import Optional from pydantic import BaseModel, ConfigDict, Field diff --git a/spras/runner.py b/spras/runner.py index 4f603f9b9..209a32f42 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -5,8 +5,8 @@ # supported algorithm imports from spras.allpairs import AllPairs from spras.btb import BowTieBuilder -from spras.dataset import Dataset from spras.config.util import Empty +from spras.dataset import Dataset from spras.domino import DOMINO, DominoParams from spras.meo import MEO, MEOParams from spras.mincostflow import MinCostFlow, MinCostFlowParams diff --git a/test/test_config.py b/test/test_config.py index b0031d029..71842c2e1 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -4,8 +4,8 @@ import pytest import spras.config.config as config -from spras.config.schema import DEFAULT_HASH_LENGTH from spras.config.container_schema import DEFAULT_CONTAINER_PREFIX +from spras.config.schema import DEFAULT_HASH_LENGTH filler_dataset_data: dict[str, str | list[str]] = { "data_dir": "fake", From 0bcd1d15ae03e5cfb4b1a0398d64585b713bb7b5 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Tue, 15 Jul 2025 20:29:01 +0000 Subject: [PATCH 57/60] fix: coerce fields to validate default --- spras/config/algorithms.py | 16 +++++++++++----- test/test_config.py | 6 +++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index 8c49c2ae2..c65ddae8a 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -4,6 +4,7 @@ and rather mainly contains validators and lower-level pydantic code. """ import ast +import copy from typing import Annotated, Any, Callable, Literal, Optional, Union, cast, get_args import numpy as np @@ -98,8 +99,14 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: # Map our fields to a list (assuming we have no nested keys), # and specify our user convenience validators - mapped_list_field: dict[str, Annotated] = { - name: (Annotated[ + mapped_list_field: dict[str, Annotated] = dict() + for field_name, field in model.model_fields.items(): + # We need to create a copy of the field, + # as we need to make sure that it gets mapped to the list coerced version of the field. + new_field = copy.deepcopy(field) + new_field.validate_default = True + + mapped_list_field[field_name] = (Annotated[ list[field.annotation], # This order isn't arbitrary. # https://docs.pydantic.dev/latest/concepts/validators/#ordering-of-validators @@ -111,9 +118,8 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: # json_schema_input_type (sensibly) overwrites, so we have to specify the entire union again here. json_schema_input_type=Union[field.annotation, list[field.annotation], str] ) if is_numpy_friendly(field.annotation) else None - ], field) for name, field in model.model_fields.items() - } - + ], new_field) + # Runtime assertion check: mapped_list_field does not contain any `__-prefixed` fields for key in mapped_list_field.keys(): assert not key.startswith("__"), f"A private key has been passed from {name}'s argument schema. " + \ diff --git a/test/test_config.py b/test/test_config.py index 71842c2e1..e38272f94 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -61,10 +61,10 @@ def get_test_config(): "name": "omicsintegrator2", "include": True, "runs": { - "strings": {"dummyMode": ["terminals", "others"], "b": 1}, + "strings": {"dummy_mode": ["terminals", "others"], "b": 1}, # spacing in np.linspace is on purpose - "singleton_string_np_linspace": {"dummyMode": "terminals", "b": "np.linspace(0, 5,2)"}, - "str_array_np_logspace": {"test": ["others", "all"], "g": "np.logspace(1,1)"} + "singleton_string_np_linspace": {"dummy_mode": "terminals", "b": "np.linspace(0, 5,2)"}, + "str_array_np_logspace": {"dummy_mode": ["others", "all"], "g": "np.logspace(1,1)"} } }, { From 1cb5d179a876517f15224aa18aba7e7e719cc9de Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Tue, 15 Jul 2025 21:55:57 +0000 Subject: [PATCH 58/60] fix: test --- config/config.yaml | 3 +- config/egfr.yaml | 57 ++++++------------ config/schema.json | 4 +- spras/config/algorithms.py | 7 ++- spras/config/config.py | 7 ++- test/test_config.py | 120 +++++++++++++++++++++++++++++-------- 6 files changed, 126 insertions(+), 72 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 49ae31f4f..30b438390 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -96,8 +96,7 @@ algorithms: - name: "domino" include: true - params: - include: true + runs: run1: slice_threshold: 0.3 module_threshold: 0.05 diff --git a/config/egfr.yaml b/config/egfr.yaml index 106963c62..363d213a1 100644 --- a/config/egfr.yaml +++ b/config/egfr.yaml @@ -25,39 +25,28 @@ algorithms: - 0.55 - 2 - 10 - d: - - 10 - g: - - 1e-3 - r: - - 0.01 - w: - - 0.1 - mu: - - 0.008 + d: 10 + g: 1e-3 + r: 0.01 + w: 0.1 + mu: 0.008 dummy_mode: ["file"] - name: omicsintegrator2 include: true runs: run1: - b: - - 4 - g: - - 0 + b: 4 + g: 0 run2: - b: - - 2 - g: - - 3 + b: 2 + g: 3 - name: meo include: true runs: run1: local_search: true - max_path_length: - - 3 - rand_restarts: - - 10 + max_path_length: 3 + rand_restarts: 10 run2: local_search: false max_path_length: 2 @@ -68,28 +57,20 @@ algorithms: include: true runs: run1: - slice_threshold: - - 0.3 - module_threshold: - - 0.05 + slice_threshold: 0.3 + module_threshold: 0.05 - name: mincostflow include: true runs: run1: - capacity: - - 15 - flow: - - 80 + capacity: 15 + flow: 80 run2: - capacity: - - 1 - flow: - - 6 + capacity: 1 + flow: 6 run3: - capacity: - - 5 - flow: - - 60 + capacity: 5 + flow: 60 datasets: - data_dir: input edge_files: diff --git a/config/schema.json b/config/schema.json index c15dcaa8a..494736275 100644 --- a/config/schema.json +++ b/config/schema.json @@ -399,7 +399,7 @@ }, "default": { "default": { - "_time": 1752606304.38952, + "_time": 1752611437.804319, "module_threshold": null, "slice_threshold": null } @@ -1086,7 +1086,7 @@ "noisy_edges": null, "random_terminals": null, "dummy_mode": null, - "seed": 1752606304389 + "seed": 1752611437804 } }, "title": "Runs", diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index c65ddae8a..889efab35 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -47,7 +47,8 @@ def python_evalish_coerce(value: Any) -> Any: } # To do this, we get the AST of our string as an expression - value_ast = ast.parse(value, mode='eval', filename='config.yaml') + # (filename='' is to make the error message more closely resemble that of eval.) + value_ast = ast.parse(value, mode='eval', filename='') # Then we do some light parsing - we're only looking to do some literal evaluation # (allowing light python notation) and some basic function parsing. Full python programs @@ -60,7 +61,7 @@ def python_evalish_coerce(value: Any) -> Any: # We get the function name back as a string function_name = ast.unparse(value_ast.body.func) - # and we use the (non-availability) safe `ast.literal_eval` to support light expressions. + # and we use the (non-availability) safe `ast.literal_eval` to support literals passed into functions. 
arguments = [ast.literal_eval(arg) for arg in value_ast.body.args] if function_name not in functions_dict: @@ -119,7 +120,7 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: json_schema_input_type=Union[field.annotation, list[field.annotation], str] ) if is_numpy_friendly(field.annotation) else None ], new_field) - + # Runtime assertion check: mapped_list_field does not contain any `__-prefixed` fields for key in mapped_list_field.keys(): assert not key.startswith("__"), f"A private key has been passed from {name}'s argument schema. " + \ diff --git a/spras/config/config.py b/spras/config/config.py index 6eeb760a7..2c0499fb7 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -79,7 +79,7 @@ def __init__(self, raw_config: dict[str, Any]): self.algorithms = None # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations. # Only includes algorithms that are set to be run with 'include: true'. - self.algorithm_params = None + self.algorithm_params: dict[str, dict[str, Any]] = dict() # Deprecated. Previously a dict mapping algorithm names to a Boolean tracking whether they used directed graphs. self.algorithm_directed = None # A dict with the analysis settings @@ -196,6 +196,11 @@ def process_algorithms(self, raw_config: RawConfig): if params_hash in prior_params_hashes: raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file ' f'(current length {self.hash_length}).') + + # We preserve the run name as it carries useful information for the parameter log, + # and is useful for testing. + run_dict["_spras_run_name"] = run_name + self.algorithm_params[alg.name][params_hash] = run_dict def process_analysis(self, raw_config: RawConfig): diff --git a/test/test_config.py b/test/test_config.py index e38272f94..3d8d67d78 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -1,11 +1,17 @@ +import copy import pickle +from typing import Iterable import numpy as np import pytest +from pydantic import BaseModel import spras.config.config as config from spras.config.container_schema import DEFAULT_CONTAINER_PREFIX from spras.config.schema import DEFAULT_HASH_LENGTH +from spras.meo import MEOParams +from spras.mincostflow import MinCostFlowParams +from spras.omicsintegrator2 import DummyMode, OmicsIntegrator2Params filler_dataset_data: dict[str, str | list[str]] = { "data_dir": "fake", @@ -61,9 +67,9 @@ def get_test_config(): "name": "omicsintegrator2", "include": True, "runs": { - "strings": {"dummy_mode": ["terminals", "others"], "b": 1}, + "strings": {"dummy_mode": ["terminals", "others"], "b": 3}, # spacing in np.linspace is on purpose - "singleton_string_np_linspace": {"dummy_mode": "terminals", "b": "np.linspace(0, 5,2)"}, + "singleton_string_np_linspace": {"dummy_mode": "terminals", "b": "np.linspace(0, 5,2,)"}, "str_array_np_logspace": {"dummy_mode": ["others", "all"], "g": "np.logspace(1,1)"} } }, @@ -71,7 +77,8 @@ def get_test_config(): "name": "meo", "include": True, "runs": { - "numbersAndBool": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": True}, + "numbersAndBoolsDuplicate": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": [True, False]}, + "numbersAndBool": {"max_path_length": 2, "rand_restarts": [float(2.0), 3], "local_search": [True]}, "numbersAndBools": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": [True, False]}, "boolArrTest": {"local_search": [True, False], 
"max_path_length": "range(1, 3)"} } @@ -80,7 +87,7 @@ def get_test_config(): "name": "mincostflow", "include": True, "runs": { - "int64artifact": {"flow": "np.arange(5,6)", "capacity": [2, 3]} + "int64artifact": {"flow": "np.arange(5, 7)", "capacity": [2, 3]} } }, ], @@ -104,22 +111,49 @@ def get_test_config(): return test_raw_config -def value_test_util(name: str, configurations: list): - assert name in config.config.algorithm_params, f"{name} isn't a present algorithm configuration!" - - keys = config.config.algorithm_params[name] - values = [config.config.algorithm_params[name][key] for key in keys] +def value_test_util(alg: str, run_name: str, param_type: type[BaseModel], configurations: Iterable[BaseModel]): + """ + Utility test function to be able to test against certain named runs + under algorithms. This is, unfortunately, a very holistic function that depends + on the current state of how config parsing is. + """ + assert alg in config.config.algorithm_params, f"{alg} isn't a present algorithm name!" + runs = config.config.algorithm_params[alg] + # Filter using the internal _spras_run_name key. + runs = {hash: params for hash, params in runs.items() if params["_spras_run_name"] == run_name} + + # We copy values so we don't mutate it + values: list[dict] = copy.deepcopy(list(runs.values())) + for value in values: + # then, remove the internal key for easy comparison. + del value["_spras_run_name"] + + # Since configurations is a bunch of objects, we need to turn those into dictionaries + # and exclude their defaults. + new_configurations = [config.model_dump(exclude_defaults=True) for config in configurations] + + # Same for values, but we reserialize them first + values = [param_type.model_validate(value).model_dump(exclude_defaults=True) for value in values] + + # Now, we need to also remove any dynamic values from values and configurations + # (_time and seeded values) + for value in values: + value.pop("_time", None) + value.pop("seed", None) + for configuration in new_configurations: + configuration.pop("_time", None) + configuration.pop("seed", None) # https://stackoverflow.com/a/50486270/7589775 # Note: We use pickle as we also compare dictionaries in these two sets - some kind of consistent total ordering # is required for the tests to consistently pass when comparing them to `configurations`. 
- set_values = set(tuple(sorted(d.items())) for d in sorted(values, key=lambda x: pickle.dumps(x, protocol=3))) - set_configurations = set(tuple(sorted(d.items())) for d in sorted(configurations, key=lambda x: pickle.dumps(x, protocol=3))) + final_values = sorted(tuple(sorted(d.items())) for d in sorted(values, key=lambda x: pickle.dumps(x, protocol=3))) + final_configurations = sorted(tuple(sorted(d.items())) for d in sorted(new_configurations, key=lambda x: pickle.dumps(x, protocol=3))) - if set_values != set_configurations: - print(f'Got: {set_values}') - print(f'Expected: {set_configurations}') - assert set_values == set_configurations + if final_values != final_configurations: + print(f'Got: {final_values}') + print(f'Expected: {final_configurations}') + assert final_values == final_configurations class TestConfig: """ @@ -225,17 +259,51 @@ def test_config_values(self): test_config = get_test_config() config.init_global(test_config) - value_test_util('strings', [{'test': "str1", 'test2': "str2"}, {'test': 'str1', 'test2': 'str3'}]) - value_test_util('numbersAndBools', [{'a': 1, 'b': float(2.0), 'c': 4, 'd': 5.6, 'f': False}, {'a': 1, 'b': 3, 'c': 4, 'd': 5.6, 'f': False}]) - - value_test_util('singleton_int64_with_array', [{'test': 1, 'test2': 2}, {'test': 1, 'test2': 3}]) - value_test_util('singleton_string_np_linspace', [{'test': "str1", 'test2': 5.0}, {'test': "str1", 'test2': 0.0}]) - value_test_util('str_array_np_logspace', [{'test': "a", 'test2': 10}] * 10 + [{'test': "b", 'test2': 10}] * 10) - - value_test_util('int64artifact', [{'test': 5, 'test2': 2}, {'test': 5, 'test2': 3}]) - - value_test_util('boolArrTest', [{'flags': True, 'range': 1}, {'flags': False, 'range': 2}, - {'flags': False, 'range': 1}, {'flags': True, 'range': 2}]) + value_test_util('omicsintegrator2', 'strings', OmicsIntegrator2Params, [ + OmicsIntegrator2Params(dummy_mode=DummyMode.terminals, b=3), + OmicsIntegrator2Params(dummy_mode=DummyMode.others, b=3) + ]) + + value_test_util('omicsintegrator2', 'singleton_string_np_linspace', OmicsIntegrator2Params, [ + OmicsIntegrator2Params(dummy_mode=DummyMode.terminals, b=5.0), + OmicsIntegrator2Params(dummy_mode=DummyMode.terminals, b=0.0) + ]) + + value_test_util('omicsintegrator2', 'str_array_np_logspace', OmicsIntegrator2Params, [ + # While these both repeat 50 times, parameter hash makes sure to not duplicate the work. + # This serves as a test to make sure _time isn't inserted during parameter combinations. + OmicsIntegrator2Params(dummy_mode=DummyMode.others, g=10), OmicsIntegrator2Params(dummy_mode=DummyMode.all, g=10) + ]) + + value_test_util('meo', 'numbersAndBools', MEOParams, [ + MEOParams(max_path_length=1, rand_restarts=2, local_search=False), + MEOParams(max_path_length=1, rand_restarts=2, local_search=True), + MEOParams(max_path_length=1, rand_restarts=3, local_search=False), + MEOParams(max_path_length=1, rand_restarts=3, local_search=True), + ]) + + # Encoding this behavior: run names are not passed into the parameter hash, + # and thus won't duplicate runs. 
+ value_test_util('meo', 'numbersAndBoolsDuplicate', MEOParams, []) + + value_test_util('meo', 'numbersAndBool', MEOParams, [ + MEOParams(max_path_length=2, rand_restarts=2, local_search=True), + MEOParams(max_path_length=2, rand_restarts=3, local_search=True), + ]) + + value_test_util('mincostflow', 'int64artifact', MinCostFlowParams, [ + MinCostFlowParams(flow=5, capacity=2), + MinCostFlowParams(flow=5, capacity=3), + MinCostFlowParams(flow=6, capacity=2), + MinCostFlowParams(flow=6, capacity=3) + ]) + + value_test_util('meo', 'boolArrTest', MEOParams, [ + MEOParams(local_search=True, max_path_length=1), + MEOParams(local_search=True, max_path_length=2), + MEOParams(local_search=False, max_path_length=1), + MEOParams(local_search=False, max_path_length=2) + ]) @pytest.mark.parametrize("ml_include, eval_include, expected_ml, expected_eval", [ (True, True, True, True), From c93244ff32dddff416d7f21c838b28fef4ed9cc9 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 15 Jul 2025 22:11:56 +0000 Subject: [PATCH 59/60] fix: correct all algorithm usage --- spras/allpairs.py | 3 ++- spras/btb.py | 3 ++- spras/domino.py | 23 ++++++++++++----------- spras/meo.py | 26 ++++++++++++++------------ spras/mincostflow.py | 19 ++++++++++--------- spras/omicsintegrator1.py | 16 +++++++++------- spras/omicsintegrator2.py | 17 +++++++++-------- spras/pathlinker.py | 17 +++++++++-------- spras/rwr.py | 16 +++++++++------- spras/strwr.py | 18 ++++++++++-------- 10 files changed, 86 insertions(+), 72 deletions(-) diff --git a/spras/allpairs.py b/spras/allpairs.py index bba5df467..5c1476e8a 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -73,7 +73,8 @@ def generate_inputs(data: Dataset, filename_map): header=["#Interactor1", "Interactor2", "Weight"]) @staticmethod - def run(inputs, output_file, args=None, container_settings=ProcessedContainerSettings()): + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() if not inputs["nodetypes"] or not inputs["network"] or not inputs["directed_flag"]: raise ValueError('Required All Pairs Shortest Paths arguments are missing') diff --git a/spras/btb.py b/spras/btb.py index 7f7a1b944..16bce75ae 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -66,7 +66,8 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(inputs, output_file, args=None, container_settings=ProcessedContainerSettings()): + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() # Tests for pytest (docker container also runs this) # Testing out here avoids the trouble that container errors provide diff --git a/spras/domino.py b/spras/domino.py index a45a445a2..d3d761e1f 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -5,6 +5,7 @@ import pandas as pd from pydantic import ConfigDict +from spras.config.container_schema import ProcessedContainerSettings from spras.config.util import NondeterministicModel from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( @@ -77,9 +78,9 @@ def generate_inputs(data, filename_map): header=['ID_interactor_A', 'ppi', 'ID_interactor_B']) @staticmethod - def run(inputs, output_file, args=None, container_framework="docker"): - if not args: - args = DominoParams() + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = 
ProcessedContainerSettings() + if not args: args = DominoParams() # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. if not inputs["network"] or not inputs["active_genes"]: @@ -90,19 +91,19 @@ def run(inputs, output_file, args=None, container_framework="docker"): # Each volume is a tuple (source, destination) volumes = list() - bind_path, network_file = prepare_volume(inputs["network"], work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) - bind_path, node_file = prepare_volume(inputs["active_genes"], work_dir) + bind_path, node_file = prepare_volume(inputs["active_genes"], work_dir, container_settings) volumes.append(bind_path) out_dir = Path(output_file).parent out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) slices_file = Path(out_dir, 'slices.txt') - bind_path, mapped_slices_file = prepare_volume(str(slices_file), work_dir) + bind_path, mapped_slices_file = prepare_volume(str(slices_file), work_dir, container_settings) volumes.append(bind_path) # Make the Python command to run within the container @@ -112,11 +113,11 @@ def run(inputs, output_file, args=None, container_framework="docker"): container_suffix = "domino" run_container_and_log('slicer', - container_framework, container_suffix, slicer_command, volumes, - work_dir) + work_dir, + container_settings) # Make the Python command to run within the container domino_command = ['domino', @@ -136,11 +137,11 @@ def run(inputs, output_file, args=None, container_framework="docker"): domino_command.extend(['--module_threshold', str(args.module_threshold)]) run_container_and_log('DOMINO', - container_framework, container_suffix, domino_command, volumes, - work_dir) + work_dir, + container_settings) # DOMINO creates a new folder in out_dir to output its modules HTML files into called active_genes # The filename is determined by the input active_genes and cannot be configured diff --git a/spras/meo.py b/spras/meo.py index 4b3f9299e..b3b8a5973 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -4,6 +4,7 @@ from pydantic import BaseModel, ConfigDict +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( add_directionality_constant, @@ -145,7 +146,7 @@ def generate_inputs(data, filename_map): # TODO add parameter validation # TODO document required arguments @staticmethod - def run(inputs, output_file=None, args=None, container_framework="docker"): + def run(inputs, output_file=None, args=None, container_settings=None): """ Run Maximum Edge Orientation in the Docker image with the provided parameters. The properties file is generated from the provided arguments. @@ -154,8 +155,8 @@ def run(inputs, output_file=None, args=None, container_framework="docker"): Only the edge output file is retained. All other output files are deleted. 
""" - if not args: - args = MEOParams() + if not container_settings: container_settings = ProcessedContainerSettings() + if not args: args = MEOParams() if inputs["edges"] is None or inputs["sources"] is None or inputs["targets"] is None: raise ValueError('Required Maximum Edge Orientation arguments are missing') @@ -165,44 +166,45 @@ def run(inputs, output_file=None, args=None, container_framework="docker"): # Each volume is a tuple (src, dest) volumes = list() - bind_path, edge_file = prepare_volume(inputs["edges"], work_dir) + bind_path, edge_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) - bind_path, source_file = prepare_volume(inputs["sources"], work_dir) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, target_file = prepare_volume(inputs["targets"], work_dir) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) out_dir = Path(output_file).parent # Maximum Edge Orientation requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_output_file = prepare_volume(str(output_file), work_dir) + bind_path, mapped_output_file = prepare_volume(str(output_file), work_dir, container_settings) volumes.append(bind_path) # Hard code the path output filename, which will be deleted path_output_file = Path(out_dir, 'path-output.txt') - bind_path, mapped_path_output = prepare_volume(str(path_output_file), work_dir) + bind_path, mapped_path_output = prepare_volume(str(path_output_file), work_dir, container_settings) volumes.append(bind_path) properties_file = 'meo-properties.txt' properties_file_local = Path(out_dir, properties_file) write_properties(filename=properties_file_local, edges=edge_file, sources=source_file, targets=target_file, edge_output=mapped_output_file, path_output=mapped_path_output, - max_path_length=args.max_path_length, local_search=args.local_search, rand_restarts=args.rand_restarts, framework=container_framework) - bind_path, properties_file = prepare_volume(str(properties_file_local), work_dir) + max_path_length=args.max_path_length, local_search=args.local_search, rand_restarts=args.rand_restarts, + framework=container_settings.framework) + bind_path, properties_file = prepare_volume(str(properties_file_local), work_dir, container_settings) volumes.append(bind_path) command = ['java', '-jar', '/meo/EOMain.jar', properties_file] container_suffix = "meo" run_container_and_log('Maximum Edge Orientation', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) properties_file_local.unlink(missing_ok=True) diff --git a/spras/mincostflow.py b/spras/mincostflow.py index 1f7ff0cf7..05dd22bf5 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -3,6 +3,7 @@ from pydantic import BaseModel, ConfigDict +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( convert_undirected_to_directed, @@ -72,9 +73,9 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(inputs, output_file, args=None, container_framework="docker"): - if not args: - args = MinCostFlowParams() + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() + if not args: args = MinCostFlowParams() # ensures 
that these parameters are required if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: @@ -86,19 +87,19 @@ def run(inputs, output_file, args=None, container_framework="docker"): # the tuple is for mapping the sources, targets, edges, and output volumes = list() - bind_path, sources_file = prepare_volume(inputs["sources"], work_dir) + bind_path, sources_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, targets_file = prepare_volume(inputs["targets"], work_dir) + bind_path, targets_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) - bind_path, edges_file = prepare_volume(inputs["edges"], work_dir) + bind_path, edges_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) # Create a prefix for the output filename and ensure the directory exists out_dir = Path(output_file).parent out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = mapped_out_dir + '/out' @@ -121,11 +122,11 @@ def run(inputs, output_file, args=None, container_framework="docker"): # constructs a docker run call run_container_and_log('MinCostFlow', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) # Check the output of the container out_dir_content = sorted(out_dir.glob('*.sif')) diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index d9ee603fb..9d1396902 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -4,6 +4,7 @@ from pydantic import BaseModel, ConfigDict, Field +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import reinsert_direction_col_mixed from spras.prm import PRM @@ -142,7 +143,8 @@ def generate_inputs(data, filename_map): # TODO add support for knockout argument # TODO add reasonable default values @staticmethod - def run(inputs, output_file, args, container_framework="docker"): + def run(inputs, output_file, args, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() if inputs["edges"] is None or inputs["prizes"] is None or output_file is None: raise ValueError('Required Omics Integrator 1 arguments are missing') @@ -151,10 +153,10 @@ def run(inputs, output_file, args, container_framework="docker"): # Each volume is a tuple (src, dest) volumes = list() - bind_path, edge_file = prepare_volume(inputs["edges"], work_dir) + bind_path, edge_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) - bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir) + bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir, container_settings) volumes.append(bind_path) # 4 dummy mode possibilities: @@ -167,13 +169,13 @@ def run(inputs, output_file, args, container_framework="docker"): if args.dummy_mode == 'file': if inputs["dummy_nodes"] is None: raise ValueError("dummy_nodes file is required when dummy_mode is set to 'file'") - bind_path, dummy_file = prepare_volume(inputs["dummy_nodes"], work_dir) + bind_path, dummy_file = prepare_volume(inputs["dummy_nodes"], work_dir, container_settings) volumes.append(bind_path) out_dir = Path(output_file).parent # Omics 
Integrator 1 requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) conf_file = 'oi1-configuration.txt' @@ -181,7 +183,7 @@ def run(inputs, output_file, args, container_framework="docker"): # Temporary file that will be deleted after running Omics Integrator 1 write_conf(conf_file_local, w=args.w, b=args.b, d=args.d, mu=args.mu, noise=args.noise, g=args.g, r=args.r) - bind_path, conf_file = prepare_volume(str(conf_file_local), work_dir) + bind_path, conf_file = prepare_volume(str(conf_file_local), work_dir, container_settings) volumes.append(bind_path) command = ['python', '/OmicsIntegrator/scripts/forest.py', @@ -213,11 +215,11 @@ def run(inputs, output_file, args, container_framework="docker"): container_suffix = "omics-integrator-1:no-conda" # no-conda version is the default run_container_and_log('Omics Integrator 1', - container_framework, container_suffix, # no-conda version is the default command, volumes, work_dir, + container_settings, {'TMPDIR': mapped_out_dir}) conf_file_local.unlink(missing_ok=True) diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index aef4f3c48..8b5c29799 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -5,6 +5,7 @@ import pandas as pd from pydantic import BaseModel, ConfigDict, Field +from spras.config.container_schema import ProcessedContainerSettings from spras.config.util import CaseInsensitiveEnum from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset @@ -110,7 +111,7 @@ def generate_inputs(data: Dataset, filename_map): # TODO add reasonable default values # TODO document required arguments @staticmethod - def run(inputs, output_file, args=None, container_framework="docker"): + def run(inputs, output_file, args=None, container_settings=None): """ Run Omics Integrator 2 in the Docker image with the provided parameters. Only the .tsv output file is retained and then renamed. 
@@ -118,8 +119,8 @@ def run(inputs, output_file, args=None, container_framework="docker"): @param output_file: the name of the output file, which will overwrite any existing file with this name @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) """ - if not args: - args = OmicsIntegrator2Params() + if not container_settings: container_settings = ProcessedContainerSettings() + if not args: args = OmicsIntegrator2Params() if inputs["edges"] is None or inputs["prizes"] is None: raise ValueError('Required Omics Integrator 2 arguments are missing') @@ -129,16 +130,16 @@ def run(inputs, output_file, args=None, container_framework="docker"): # Each volume is a tuple (src, dest) volumes = list() - bind_path, edge_file = prepare_volume(inputs["edges"], work_dir) + bind_path, edge_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) - bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir) + bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir, container_settings) volumes.append(bind_path) out_dir = Path(output_file).parent # Omics Integrator 2 requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(out_dir, work_dir) + bind_path, mapped_out_dir = prepare_volume(out_dir, work_dir, container_settings) volumes.append(bind_path) command = ['OmicsIntegrator', '-e', edge_file, '-p', prize_file, @@ -164,11 +165,11 @@ def run(inputs, output_file, args=None, container_framework="docker"): container_suffix = "omics-integrator-2:v2" run_container_and_log('Omics Integrator 2', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) # TODO do we want to retain other output files? 
# TODO if deleting other output files, write them all to a tmp directory and copy diff --git a/spras/pathlinker.py b/spras/pathlinker.py index da0a91ba2..f71015f0e 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -4,6 +4,7 @@ from pydantic import BaseModel, ConfigDict +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import ( @@ -76,9 +77,9 @@ def generate_inputs(data, filename_map): header=["#Interactor1","Interactor2","Weight"]) @staticmethod - def run(inputs, output_file, args=None, container_framework="docker"): - if not args: - args = PathLinkerParams() + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() + if not args: args = PathLinkerParams() if not inputs["nodetypes"] or not inputs["network"]: raise ValueError('Required PathLinker arguments are missing') @@ -88,10 +89,10 @@ def run(inputs, output_file, args=None, container_framework="docker"): # Each volume is a tuple (src, dest) volumes = list() - bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir) + bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir, container_settings) volumes.append(bind_path) - bind_path, network_file = prepare_volume(inputs["network"], work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) # PathLinker does not provide an argument to set the output directory @@ -99,7 +100,7 @@ def run(inputs, output_file, args=None, container_framework="docker"): out_dir = Path(output_file).parent # PathLinker requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = mapped_out_dir + '/out' # Use posix path inside the container @@ -113,11 +114,11 @@ def run(inputs, output_file, args=None, container_framework="docker"): container_suffix = "pathlinker:v2" run_container_and_log('PathLinker', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) # Rename the primary output file to match the desired output filename # Currently PathLinker only writes one output file so we do not need to delete others diff --git a/spras/rwr.py b/spras/rwr.py index dff5bdb97..a46e734e6 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -4,6 +4,7 @@ import pandas as pd from pydantic import BaseModel, ConfigDict +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container from spras.dataset import Dataset from spras.interactome import reinsert_direction_col_directed @@ -45,7 +46,8 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(inputs, output_file, args, container_framework="docker"): + def run(inputs, output_file, args, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() if not inputs["nodes"] or not inputs["network"]: raise ValueError('Required RWR arguments are missing') @@ -60,10 +62,10 @@ def run(inputs, output_file, args, container_framework="docker"): # Each volume is a tuple 
(src, dest) volumes = list() - bind_path, nodes_file = prepare_volume(inputs["nodes"], work_dir) + bind_path, nodes_file = prepare_volume(inputs["nodes"], work_dir, container_settings) volumes.append(bind_path) - bind_path, network_file = prepare_volume(inputs["network"], work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) # RWR does not provide an argument to set the output directory @@ -71,7 +73,7 @@ def run(inputs, output_file, args, container_framework="docker"): out_dir = Path(output_file).parent # RWR requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = mapped_out_dir + "/output.txt" command = ['python', @@ -85,11 +87,11 @@ def run(inputs, output_file, args, container_framework="docker"): command.extend(['--alpha', str(args.alpha)]) container_suffix = 'rwr:v1' - out = run_container(container_framework, - container_suffix, + out = run_container(container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) print(out) # Rename the primary output file to match the desired output filename diff --git a/spras/strwr.py b/spras/strwr.py index 1b9159eff..28a76099e 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -4,6 +4,7 @@ import pandas as pd from pydantic import BaseModel, ConfigDict +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container from spras.dataset import Dataset from spras.interactome import reinsert_direction_col_directed @@ -47,7 +48,8 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(inputs, output_file, args, container_framework="docker"): + def run(inputs, output_file, args, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() if not inputs["sources"] or not inputs["targets"] or not inputs["network"] or not output_file: raise ValueError('Required local_neighborhood arguments are missing') @@ -63,13 +65,13 @@ def run(inputs, output_file, args, container_framework="docker"): # Each volume is a tuple (src, dest) volumes = list() - bind_path, source_file = prepare_volume(inputs["sources"], work_dir) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, target_file = prepare_volume(inputs["targets"], work_dir) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) - bind_path, network_file = prepare_volume(inputs["network"], work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) # ST_RWR does not provide an argument to set the output directory @@ -77,7 +79,7 @@ def run(inputs, output_file, args, container_framework="docker"): out_dir = Path(output_file).parent # ST_RWR requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = mapped_out_dir + "/output.txt" command = 
['python',
@@ -92,11 +94,11 @@
         command.extend(['--alpha', str(args.alpha)])

     container_suffix = 'st-rwr:v1'
-    out = run_container(container_framework,
-                        container_suffix,
+    out = run_container(container_suffix,
                         command,
                         volumes,
-                        work_dir)
+                        work_dir,
+                        container_settings)
     print(out)

     # Rename the primary output file to match the desired output filename

From 69268f4ca83ee6d9977f12d0124e613df67e0ab1 Mon Sep 17 00:00:00 2001
From: "Tristan F.-R."
Date: Tue, 15 Jul 2025 23:10:59 +0000
Subject: [PATCH 60/60] chore: talk about resumability

---
 Snakefile | 2 +-
 spras/config/schema.py | 24 ++++++++++++++++++++----
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/Snakefile b/Snakefile
index 34681cb02..358a83f42 100644
--- a/Snakefile
+++ b/Snakefile
@@ -25,7 +25,7 @@ algorithm_params = _config.config.algorithm_params
 algorithm_directed = _config.config.algorithm_directed
 pca_params = _config.config.pca_params
 hac_params = _config.config.hac_params
-FRAMEWORK = _config.config.container_framework
+FRAMEWORK = _config.config.container_settings.framework

 # Return the dataset or gold_standard dictionary from the config file given the label
 def get_dataset(_datasets, label):
diff --git a/spras/config/schema.py b/spras/config/schema.py
index b2ff0b3bd..49100bef3 100644
--- a/spras/config/schema.py
+++ b/spras/config/schema.py
@@ -117,11 +117,27 @@ class ReconstructionSettings(BaseModel):
     model_config = ConfigDict(extra='forbid')

 class RawConfig(BaseModel):
+    resume: bool = Field(alias="_resume", default=False)
+    """
+    Declares whether a config is resumable. This is meant to be used internally, as it
+    enforces extra preconditions on the config (all defaults must be explicitly declared
+    within the config, and the config must match the specified hash).
+
+    Unlike their non-resumable counterparts, resumable configurations store all configuration
+    defaults (including, most importantly, _time from NondeterministicModel and any seeded values).
+
+    Resumable configurations are generated whenever a non-resumable configuration is run, inside
+    `{output}/resumables/{hash}.yaml`, where {hash} is a hash of the configuration
+    _excluding_ default values.
+
+    By default, SPRAS runs through Snakemake will generate a resumable configuration if none is
+    present, or reuse the configuration associated with its hash otherwise.
+    """
+
     containers: ContainerSettings

-    hash_length: int = Field(
-        description="The length of the hash used to identify a parameter combination",
-        default=DEFAULT_HASH_LENGTH)
+    hash_length: int = DEFAULT_HASH_LENGTH
+    "The length of the hash used to identify a parameter combination"

     # See algorithms.py for more information about AlgorithmUnion
     algorithms: list[AlgorithmUnion] # type: ignore - pydantic allows this.
@@ -131,4 +147,4 @@ class RawConfig(BaseModel):

     reconstruction_settings: ReconstructionSettings

-    model_config = ConfigDict(extra='forbid')
+    model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True)
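
The patches above lean on a handful of pydantic and Python mechanics that are easy to miss in diff form; the sketches below illustrate them with simplified stand-in models, not code from the SPRAS tree. First, the `discriminator`/`oneOf` block added to config/schema.json corresponds to pydantic's tagged-union feature: validation dispatches on the literal `name` field instead of trying every member of a plain `anyOf` union, and the discriminator mapping in the JSON schema falls out of the same declaration. A minimal sketch (the two toy models here are stand-ins):

from typing import Annotated, Literal, Union

from pydantic import BaseModel, Field, TypeAdapter

class PathLinkerModel(BaseModel):
    name: Literal["pathlinker"]
    k: int = 100

class MinCostFlowModel(BaseModel):
    name: Literal["mincostflow"]
    flow: int = 80

# Dispatching on "name" is also what emits the discriminator mapping
# into the generated JSON schema.
AlgorithmUnion = Annotated[
    Union[PathLinkerModel, MinCostFlowModel],
    Field(discriminator="name"),
]

adapter = TypeAdapter(list[AlgorithmUnion])
parsed = adapter.validate_python([
    {"name": "pathlinker", "k": 10},
    {"name": "mincostflow"},
])
print(parsed)  # [PathLinkerModel(name='pathlinker', k=10), MinCostFlowModel(name='mincostflow', flow=80)]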
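
Likewise, the `extra='forbid'` threaded through the params models and the generated run models changes pydantic's default behavior of silently dropping unknown keys, which is how the misspelled `dummyMode` keys (for `dummy_mode`) survived in the old test config; with forbid, the typo becomes a validation error. A minimal sketch:

from pydantic import BaseModel, ConfigDict, ValidationError

class Params(BaseModel):
    dummy_mode: str = "terminals"

    model_config = ConfigDict(extra="forbid")

try:
    Params(dummyMode="others")  # camelCase typo for dummy_mode
except ValidationError as err:
    print(err)  # dummyMode: Extra inputs are not permitted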
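
The `python_evalish_coerce` validator shown in the spras/config/algorithms.py hunks accepts strings such as "np.linspace(0, 5,2)" without ever calling `eval`: the string is parsed into an AST, checked to be a single call to a whitelisted function, and the arguments are recovered with `ast.literal_eval`. A self-contained approximation of that logic (the whitelist here is illustrative, not the exact `functions_dict` from the patch):

import ast

import numpy as np

ALLOWED = {
    "np.linspace": np.linspace,
    "np.logspace": np.logspace,
    "np.arange": np.arange,
    "range": range,
}

def evalish(value: str):
    # Parse the string as a single expression instead of executing it.
    tree = ast.parse(value, mode="eval")
    if not isinstance(tree.body, ast.Call):
        raise ValueError(f'The python code "{value}" should be calling a function directly.')
    function_name = ast.unparse(tree.body.func)
    if function_name not in ALLOWED:
        raise ValueError(f"{function_name} is not an allowed function to be run!")
    # literal_eval only accepts literals, so arbitrary code in the
    # arguments is rejected rather than executed.
    arguments = [ast.literal_eval(arg) for arg in tree.body.args]
    return ALLOWED[function_name](*arguments)

print(list(evalish("np.linspace(0, 5,2)")))  # [0.0, 5.0]
print(list(evalish("range(1, 3)")))          # [1, 2]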
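
PATCH 57 deep-copies each field and flips `validate_default` because pydantic v2 otherwise returns declared defaults as-is, bypassing validators and coercion entirely. A toy field (not a SPRAS model) showing the difference:

from pydantic import BaseModel, Field

class Lenient(BaseModel):
    k: int = Field(default="100")  # default bypasses validation

class Checked(BaseModel):
    k: int = Field(default="100", validate_default=True)

print(repr(Lenient().k))  # '100' -- the string default leaks through unchecked
print(repr(Checked().k))  # 100 -- the default is coerced (or rejected) like user input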
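
PATCH 59 also moves every run() signature from `container_settings=ProcessedContainerSettings()` to `container_settings=None` with an in-body fallback. A Python default argument is evaluated once at function-definition time and shared by every call, which is hazardous for a mutable settings object; the None sentinel constructs a fresh value per call. Illustrated with a stand-in dataclass (not the real ProcessedContainerSettings):

from dataclasses import dataclass, field

@dataclass
class Settings:
    volumes: list[str] = field(default_factory=list)

def run_shared(settings=Settings()):  # one instance, created at definition time
    settings.volumes.append("bind")
    return settings

def run_fresh(settings=None):  # a new instance on every call
    if settings is None:
        settings = Settings()
    settings.volumes.append("bind")
    return settings

print(run_shared().volumes)  # ['bind']
print(run_shared().volumes)  # ['bind', 'bind'] -- state leaks between calls
print(run_fresh().volumes)   # ['bind']
print(run_fresh().volumes)   # ['bind']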
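
Finally, PATCH 60's resumability docstring keys resumable configs by a hash of the configuration excluding default values. With pydantic, `model_dump(exclude_defaults=True)` (already used in the tests above) yields exactly that non-default view, which can then be hashed deterministically and truncated to `hash_length`; a sketch of the idea, with a hypothetical `config_hash` helper and toy model rather than the actual SPRAS hashing code:

import hashlib
import json

from pydantic import BaseModel

class Toy(BaseModel):
    hash_length: int = 7
    label: str = "run"

def config_hash(cfg: BaseModel, length: int = 7) -> str:
    # Serialize only non-default fields, with sorted keys for determinism,
    # so unrelated defaults (timestamps, seeds) cannot perturb the hash.
    payload = json.dumps(cfg.model_dump(exclude_defaults=True), sort_keys=True)
    return hashlib.sha256(payload.encode()).hexdigest()[:length]

print(config_hash(Toy(label="egfr")))  # stable 7-character identifier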