From 647f947636b061449996f0eff9cbb6af9bc450c3 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 11:27:29 -0700 Subject: [PATCH 01/68] feat: rough draft of args design --- spras/allpairs.py | 13 ++++++------ spras/btb.py | 32 +++++++++--------------------- spras/config/util.py | 8 ++++++++ spras/domino.py | 47 +++++++++++++++++++------------------------- spras/prm.py | 20 ++++++++++++++----- 5 files changed, 59 insertions(+), 61 deletions(-) diff --git a/spras/allpairs.py b/spras/allpairs.py index 222794dbb..b1ffe2ee9 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -1,6 +1,7 @@ import warnings from pathlib import Path +from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import ( @@ -14,7 +15,7 @@ __all__ = ['AllPairs'] -class AllPairs(PRM): +class AllPairs(PRM[Empty]): required_inputs = ['nodetypes', 'network', 'directed_flag'] dois = [] @@ -71,7 +72,7 @@ def generate_inputs(data: Dataset, filename_map): header=["#Interactor1", "Interactor2", "Weight"]) @staticmethod - def run(nodetypes=None, network=None, directed_flag=None, output_file=None, container_framework="docker"): + def run(inputs, args, output_file, container_framework="docker"): """ Run All Pairs Shortest Paths with Docker @param nodetypes: input node types with sources and targets (required) @@ -79,7 +80,7 @@ def run(nodetypes=None, network=None, directed_flag=None, output_file=None, cont @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) @param output_file: path to the output pathway file (required) """ - if not nodetypes or not network or not output_file or not directed_flag: + if not inputs["nodetypes"] or not inputs["network"] or not inputs["directed_flag"]: raise ValueError('Required All Pairs Shortest Paths arguments are missing') work_dir = '/apsp' @@ -87,10 +88,10 @@ def run(nodetypes=None, network=None, directed_flag=None, output_file=None, cont # Each volume is a tuple (src, dest) volumes = list() - bind_path, node_file = prepare_volume(nodetypes, work_dir) + bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir) volumes.append(bind_path) - bind_path, network_file = prepare_volume(network, work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir) volumes.append(bind_path) # Create the parent directories for the output file if needed @@ -103,7 +104,7 @@ def run(nodetypes=None, network=None, directed_flag=None, output_file=None, cont '--network', network_file, '--nodes', node_file, '--output', mapped_out_file] - if Path(directed_flag).read_text().strip() == "true": + if Path(inputs["directed_flag"]).read_text().strip() == "true": command.append("--directed") container_suffix = "allpairs:v4" diff --git a/spras/btb.py b/spras/btb.py index 35d33bb72..a4098ee08 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -1,5 +1,6 @@ from pathlib import Path +from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( convert_undirected_to_directed, @@ -23,19 +24,13 @@ Interactor1 Interactor2 Weight """ -class BowTieBuilder(PRM): +class BowTieBuilder(PRM[Empty]): required_inputs = ['sources', 'targets', 'edges'] dois = ["10.1186/1752-0509-3-67"] #generate input taken from meo.py beacuse they have same input requirements @staticmethod def generate_inputs(data, filename_map): - """ - Access fields from the dataset and 
write the required input files - @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type - @return: - """ for input_type in BowTieBuilder.required_inputs: if input_type not in filename_map: raise ValueError(f"{input_type} filename is missing") @@ -70,30 +65,21 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(sources=None, targets=None, edges=None, output_file=None, container_framework="docker"): - """ - Run BTB with Docker - @param sources: input source file (required) - @param targets: input target file (required) - @param edges: input edge file (required) - @param output_file: path to the output pathway file (required) - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - """ - + def run(inputs, args, output_file, container_framework="docker"): # Tests for pytest (docker container also runs this) # Testing out here avoids the trouble that container errors provide - if not sources or not targets or not edges or not output_file: + if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: raise ValueError('Required BowTieBuilder arguments are missing') - if not Path(sources).exists() or not Path(targets).exists() or not Path(edges).exists(): + if not Path(inputs["sources"]).exists() or not Path(inputs["targets"]).exists() or not Path(inputs["edges"]).exists(): raise ValueError('Missing input file') # Testing for btb index errors # TODO: This error will never actually occur if the inputs are passed through # `generate_inputs`. See the discussion about removing this or making this a habit at # https://github.com/Reed-CompBio/spras/issues/306. - with open(edges, 'r') as edge_file: + with open(inputs["edges"], 'r') as edge_file: try: for line in edge_file: line = line.strip().split('\t')[2] @@ -107,13 +93,13 @@ def run(sources=None, targets=None, edges=None, output_file=None, container_fram # Each volume is a tuple (src, dest) volumes = list() - bind_path, source_file = prepare_volume(sources, work_dir) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir) volumes.append(bind_path) - bind_path, target_file = prepare_volume(targets, work_dir) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir) volumes.append(bind_path) - bind_path, edges_file = prepare_volume(edges, work_dir) + bind_path, edges_file = prepare_volume(inputs["edges"], work_dir) volumes.append(bind_path) # Use its --output argument to set the output file prefix to specify an absolute path and prefix diff --git a/spras/config/util.py b/spras/config/util.py index b7680222b..c23374a50 100644 --- a/spras/config/util.py +++ b/spras/config/util.py @@ -1,4 +1,5 @@ from enum import Enum +from pydantic import BaseModel, ConfigDict from typing import Any @@ -17,3 +18,10 @@ def _missing_(cls, value: Any): if member.lower() == value: return member return None + + +class Empty(BaseModel): + """ + The empty base model. Used for specifying that an algorithm takes no parameters. 
+ """ + model_config = ConfigDict(extra="forbid") diff --git a/spras/domino.py b/spras/domino.py index 5205a81cd..a70f1a1e3 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -2,6 +2,8 @@ from pathlib import Path import pandas as pd +from pydantic import BaseModel, ConfigDict +from typing import Optional from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( @@ -16,6 +18,14 @@ ID_PREFIX = 'ENSG0' ID_PREFIX_LEN = len(ID_PREFIX) +class DominoParams(BaseModel): + module_threshold: Optional[float] + "the p-value threshold for considering a slice as relevant (optional)" + + slice_threshold: Optional[float] + "the p-value threshold for considering a putative module as final module (optional)" + + model_config = ConfigDict(use_attribute_docstrings=True) """ DOMINO will construct a fully undirected graph from the provided input file @@ -26,18 +36,12 @@ - the expected raw input file should have node pairs in the 1st and 3rd columns, with a 'ppi' in the 2nd column - it can include repeated and bidirectional edges """ -class DOMINO(PRM): +class DOMINO(PRM[DominoParams]): required_inputs = ['network', 'active_genes'] dois = ["10.15252/msb.20209593"] @staticmethod def generate_inputs(data, filename_map): - """ - Access fields from the dataset and write the required input files - @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type - @return: - """ for input_type in DOMINO.required_inputs: if input_type not in filename_map: raise ValueError(f"{input_type} filename is missing") @@ -72,20 +76,9 @@ def generate_inputs(data, filename_map): header=['ID_interactor_A', 'ppi', 'ID_interactor_B']) @staticmethod - def run(network=None, active_genes=None, output_file=None, slice_threshold=None, module_threshold=None, container_framework="docker"): - """ - Run DOMINO with Docker. - Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. - DOMINO produces multiple output module files in an HTML format. SPRAS concatenates these files into one file. - @param network: input network file (required) - @param active_genes: input active genes (required) - @param output_file: path to the output pathway file (required) - @param slice_threshold: the p-value threshold for considering a slice as relevant (optional) - @param module_threshold: the p-value threshold for considering a putative module as final module (optional) - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - """ - - if not network or not active_genes or not output_file: + def run(inputs, args, output_file, container_framework="docker"): + # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. 
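+        # A sketch of how callers are expected to construct these typed args
+        # (assuming pydantic v2): DominoParams(slice_threshold=0.3, module_threshold=0.05),
+        # or DominoParams.model_validate({...}) when reading a config dict. As declared
+        # above, the Optional fields have no defaults, so pydantic treats them as
+        # required-but-nullable; patch 04 in this series adds `= None` defaults so
+        # they can be omitted entirely.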
+ if not inputs["network"] or not inputs["active_genes"]: raise ValueError('Required DOMINO arguments are missing') work_dir = '/spras' @@ -93,10 +86,10 @@ def run(network=None, active_genes=None, output_file=None, slice_threshold=None, # Each volume is a tuple (source, destination) volumes = list() - bind_path, network_file = prepare_volume(network, work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir) volumes.append(bind_path) - bind_path, node_file = prepare_volume(active_genes, work_dir) + bind_path, node_file = prepare_volume(inputs["active_genes"], work_dir) volumes.append(bind_path) out_dir = Path(output_file).parent @@ -132,11 +125,11 @@ def run(network=None, active_genes=None, output_file=None, slice_threshold=None, '--visualization', 'true'] # Add optional arguments - if slice_threshold is not None: + if args.slice_threshold is not None: # DOMINO readme has the wrong argument https://github.com/Shamir-Lab/DOMINO/issues/12 - domino_command.extend(['--slice_threshold', str(slice_threshold)]) - if module_threshold is not None: - domino_command.extend(['--module_threshold', str(module_threshold)]) + domino_command.extend(['--slice_threshold', str(args.slice_threshold)]) + if args.module_threshold is not None: + domino_command.extend(['--module_threshold', str(args.module_threshold)]) run_container_and_log('DOMINO', container_framework, diff --git a/spras/prm.py b/spras/prm.py index b5d8501dd..06d005b2a 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -1,11 +1,12 @@ -import typing from abc import ABC, abstractmethod -from typing import Any +from pydantic import BaseModel +from typing import Any, cast, TypeVar, Generic from spras.dataset import Dataset +T = TypeVar('T', bound=BaseModel) -class PRM(ABC): +class PRM(ABC, Generic[T]): """ The PRM (Pathway Reconstruction Module) class, which defines the interface that `runner.py` uses to handle @@ -15,7 +16,7 @@ class PRM(ABC): required_inputs: list[str] = [] # DOIs aren't strictly required (e.g. local neighborhood), # but it should be explicitly declared that there are no DOIs. - dois: list[str] = typing.cast(list[str], None) + dois: list[str] = cast(list[str], None) def __init_subclass__(cls): # modified from https://stackoverflow.com/a/58206480/7589775 @@ -30,11 +31,20 @@ def __init_subclass__(cls): @staticmethod @abstractmethod def generate_inputs(data: Dataset, filename_map: dict[str, str]): + """ + Access fields from the dataset and write the required input files + @param data: dataset + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type + """ raise NotImplementedError @staticmethod @abstractmethod - def run(**kwargs): + def run(inputs: dict[str, str], args: T, output_file: str, container_framework="docker"): + """ + Runs an algorithm with the specified inputs, algorithm params (T), + the designated output_file, and the desired container_framework. + """ raise NotImplementedError @staticmethod From 76011e07978d38fd41dded82434a3e8a5f210154 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Mon, 14 Jul 2025 18:57:10 +0000 Subject: [PATCH 02/68] feat: type oi1/oi2, rwr/strwr --- spras/omicsintegrator1.py | 103 ++++++++++++++++++++++++-------------- spras/omicsintegrator2.py | 76 ++++++++++++++++++++-------- spras/rwr.py | 24 ++++++--- spras/strwr.py | 27 ++++++---- 4 files changed, 152 insertions(+), 78 deletions(-) diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 0d3eb4bfd..7a69a01d6 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -1,4 +1,6 @@ from pathlib import Path +from pydantic import BaseModel, ConfigDict +from typing import Optional from spras.containers import prepare_volume, run_container_and_log from spras.interactome import reinsert_direction_col_mixed @@ -35,8 +37,47 @@ def write_conf(filename=Path('config.txt'), w=None, b=None, d=None, mu=None, noi f.write('processes = 1\n') f.write('threads = 1\n') +class OmicsIntegrator1Params(BaseModel): + dummy_mode: Optional[str] + mu_squared: Optional[str] + exclude_terms: Optional[str] -class OmicsIntegrator1(PRM): + noisy_edges: Optional[str] + "How many times you would like to add noise to the given edge values and re-run the algorithm." + + shuffled_prizes: Optional[int] + "shuffled_prizes: How many times the algorithm should shuffle the prizes and re-run" + + random_terminals: Optional[int] + "How many times to apply the given prizes to random nodes in the interactome" + + seed: Optional[str] + "the randomness seed to use" + + w: Optional[float] + "the number of trees" + + b: Optional[str] + "the trade-off between including more terminals and using less reliable edges" + + d: Optional[str] + "controls the maximum path-length from v0 to terminal nodes" + + mu: Optional[float] + "controls the degree-based negative prizes (defualt 0.0)" + + noise: Optional[str] + "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations" + + g: Optional[str] + "(Gamma) multiplicative edge penalty from degree of endpoints" + + r: Optional[str] + "msgsteiner parameter that adds random noise to edges, which is rarely needed because the Forest --noisyEdges option is recommended instead (default 0)" + + model_config = ConfigDict(use_attribute_docstrings=True) + +class OmicsIntegrator1(PRM[OmicsIntegrator1Params]): """ Omics Integrator 1 works with partially directed graphs - it takes in the universal input directly @@ -96,27 +137,12 @@ def generate_inputs(data, filename_map): with open(filename_map['dummy_nodes'], mode='w'): pass - # TODO add parameter validation # TODO add support for knockout argument # TODO add reasonable default values # TODO document required arguments @staticmethod - def run(edges=None, prizes=None, dummy_nodes=None, dummy_mode=None, mu_squared=None, exclude_terms=None, - output_file=None, noisy_edges=None, shuffled_prizes=None, random_terminals=None, - seed=None, w=None, b=None, d=None, mu=None, noise=None, g=None, r=None, container_framework="docker"): - """ - Run Omics Integrator 1 in the Docker image with the provided parameters. - Does not support the garnet, cyto30, knockout, cv, or cv-reps arguments. - The configuration file is generated from the provided arguments. - Does not support the garnetBeta, processes, or threads configuration file parameters. - The msgpath is not required because msgsteiner is available in the Docker image. - Only the optimal forest sif file is retained. - All other output files are deleted. 
-        @param output_file: the name of the output sif file for the optimal forest, which will overwrite any
-        existing file with this name
-        @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional)
-        """
-        if edges is None or prizes is None or output_file is None or w is None or b is None or d is None:
+    def run(inputs, args, output_file, container_framework="docker"):
+        if inputs["edges"] is None or inputs["prizes"] is None or output_file is None or args.w is None or args.b is None or args.d is None:
             raise ValueError('Required Omics Integrator 1 arguments are missing')
 
         work_dir = '/spras'
 
         # Each volume is a tuple (src, dest)
         volumes = list()
 
-        bind_path, edge_file = prepare_volume(edges, work_dir)
+        bind_path, edge_file = prepare_volume(inputs["edges"], work_dir)
         volumes.append(bind_path)
 
-        bind_path, prize_file = prepare_volume(prizes, work_dir)
+        bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir)
         volumes.append(bind_path)
 
         # 4 dummy mode possibilities:
         #   1. terminals -> connect the dummy node to all nodes that have been assigned prizes
         #   2. all -> connect the dummy node to all nodes in the interactome
         #   3. others -> connect the dummy node to all nodes that are not terminal nodes
         #   4. file -> connect the dummy node to a specific list of nodes provided in a file
 
         # add dummy node file to the volume if dummy_mode is not None and it is 'file'
-        if dummy_mode == 'file':
-            if dummy_nodes is None:
+        if args.dummy_mode == 'file':
+            if inputs["dummy_nodes"] is None:
                 raise ValueError("dummy_nodes file is required when dummy_mode is set to 'file'")
-            bind_path, dummy_file = prepare_volume(dummy_nodes, work_dir)
+            bind_path, dummy_file = prepare_volume(inputs["dummy_nodes"], work_dir)
             volumes.append(bind_path)
 
         out_dir = Path(output_file).parent
 
         conf_file = 'oi1-configuration.txt'
         conf_file_local = Path(out_dir, conf_file)
         # Temporary file that will be deleted after running Omics Integrator 1
-        write_conf(conf_file_local, w=w, b=b, d=d, mu=mu, noise=noise, g=g, r=r)
+        write_conf(conf_file_local, w=args.w, b=args.b, d=args.d, mu=args.mu,
+                   noise=args.noise, g=args.g, r=args.r)
         bind_path, conf_file = prepare_volume(str(conf_file_local), work_dir)
         volumes.append(bind_path)
 
                    '--outlabel', 'oi1']
 
         # add the dummy mode argument
-        if dummy_mode is not None and dummy_mode:
+        if args.dummy_mode is not None and args.dummy_mode:
             # for custom dummy modes, add the file (dummy_file is the container path
             # returned by prepare_volume above)
-            if dummy_mode == 'file':
+            if args.dummy_mode == 'file':
                 command.extend(['--dummyMode', dummy_file])
             # else pass in the dummy_mode and let oi1 handle it
             else:
-                command.extend(['--dummyMode', dummy_mode])
+                command.extend(['--dummyMode', args.dummy_mode])
 
         # Add optional arguments
-        if mu_squared is not None and mu_squared:
+        if args.mu_squared is not None and args.mu_squared:
             command.extend(['--musquared'])
-        if exclude_terms is not None and exclude_terms:
+        if args.exclude_terms is not None and args.exclude_terms:
             command.extend(['--excludeTerms'])
-        if noisy_edges is not None:
-            command.extend(['--noisyEdges', str(noisy_edges)])
-        if shuffled_prizes is not None:
-            command.extend(['--shuffledPrizes', str(shuffled_prizes)])
-        if random_terminals is not None:
-            command.extend(['--randomTerminals', str(random_terminals)])
-        if seed is not None:
-            
command.extend(['--seed', str(seed)]) + if args.noisy_edges is not None: + command.extend(['--noisyEdges', str(args.noisy_edges)]) + if args.shuffled_prizes is not None: + command.extend(['--shuffledPrizes', str(args.shuffled_prizes)]) + if args.random_terminals is not None: + command.extend(['--randomTerminals', str(args.random_terminals)]) + if args.seed is not None: + command.extend(['--seed', str(args.seed)]) container_suffix = "omics-integrator-1:no-conda" # no-conda version is the default run_container_and_log('Omics Integrator 1', diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 355d71bd6..42dc466cd 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -1,4 +1,6 @@ from pathlib import Path +from pydantic import BaseModel, ConfigDict +from typing import Optional import pandas as pd @@ -10,6 +12,36 @@ __all__ = ['OmicsIntegrator2'] +class OmicsIntegrator2Params(BaseModel): + w: float = 6 + "Omega: the weight of the edges connecting the dummy node to the nodes selected by dummyMode" + + b: float = 1 + "Beta: scaling factor of prizes" + + g: float = 20 + "Gamma: multiplicative edge penalty from degree of endpoints" + + noise: Optional[str] + "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations." + + noisy_edges: Optional[int] + "An integer specifying how many times to add noise to the given edge values and re-run." + + random_terminals: Optional[str] + "An integer specifying how many times to apply your given prizes to random nodes in the interactome and re-run" + + dummy_mode: Optional[str] + """ + Tells the program which nodes in the interactome to connect the dummy node to. (default: terminals) + "terminals" = connect to all terminals + "others" = connect to all nodes except for terminals + "all" = connect to all nodes in the interactome. + """ + + seed: Optional[str] + "The random seed to use for this run." + """ Omics Integrator 2 will construct a fully undirected graph from the provided input file - in the algorithm, it uses nx.Graph() objects, which are undirected @@ -20,11 +52,12 @@ - the expected raw input file should have node pairs in the 1st and 2nd columns, with a weight in the 3rd column - it can include repeated and bidirectional edges """ -class OmicsIntegrator2(PRM): +class OmicsIntegrator2(PRM[OmicsIntegrator2Params]): required_inputs = ['prizes', 'edges'] # OI2 does not have a specific paper. Instead, we link to the OI1 paper. dois = ["10.1371/journal.pcbi.1004879"] + @staticmethod def generate_inputs(data: Dataset, filename_map): """ Access fields from the dataset and write the required input files. @@ -69,8 +102,7 @@ def generate_inputs(data: Dataset, filename_map): # TODO add reasonable default values # TODO document required arguments @staticmethod - def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise=None, noisy_edges=None, - random_terminals=None, dummy_mode=None, seed=None, container_framework="docker"): + def run(inputs, args, output_file, container_framework="docker"): """ Run Omics Integrator 2 in the Docker image with the provided parameters. Only the .tsv output file is retained and then renamed. 
@@ -78,7 +110,7 @@ def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise @param output_file: the name of the output file, which will overwrite any existing file with this name @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) """ - if edges is None or prizes is None or output_file is None: + if inputs["edges"] is None or inputs["prizes"] is None: raise ValueError('Required Omics Integrator 2 arguments are missing') work_dir = '/spras' @@ -86,10 +118,10 @@ def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise # Each volume is a tuple (src, dest) volumes = list() - bind_path, edge_file = prepare_volume(edges, work_dir) + bind_path, edge_file = prepare_volume(inputs["edges"], work_dir) volumes.append(bind_path) - bind_path, prize_file = prepare_volume(prizes, work_dir) + bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir) volumes.append(bind_path) out_dir = Path(output_file).parent @@ -102,23 +134,23 @@ def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise '-o', mapped_out_dir, '--filename', 'oi2'] # Add optional arguments - if w is not None: - command.extend(['-w', str(w)]) - if b is not None: - command.extend(['-b', str(b)]) - if g is not None: - command.extend(['-g', str(g)]) - if noise is not None: - command.extend(['-noise', str(noise)]) - if noisy_edges is not None: - command.extend(['--noisy_edges', str(noisy_edges)]) - if random_terminals is not None: - command.extend(['--random_terminals', str(random_terminals)]) - if dummy_mode is not None: + if args.w is not None: + command.extend(['-w', str(args.w)]) + if args.w is not None: + command.extend(['-b', str(args.b)]) + if args.w is not None: + command.extend(['-g', str(args.g)]) + if args.noise is not None: + command.extend(['-noise', str(args.noise)]) + if args.noisy_edges is not None: + command.extend(['--noisy_edges', str(args.noisy_edges)]) + if args.random_terminals is not None: + command.extend(['--random_terminals', str(args.random_terminals)]) + if args.dummy_mode is not None: # This argument does not follow the other naming conventions - command.extend(['--dummyMode', str(dummy_mode)]) - if seed is not None: - command.extend(['--seed', str(seed)]) + command.extend(['--dummyMode', str(args.dummy_mode)]) + if args.seed is not None: + command.extend(['--seed', str(args.seed)]) container_suffix = "omics-integrator-2:v2" run_container_and_log('Omics Integrator 2', diff --git a/spras/rwr.py b/spras/rwr.py index 5c08d6777..12fc5d422 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -1,4 +1,6 @@ from pathlib import Path +from pydantic import BaseModel, ConfigDict +from typing import Optional import pandas as pd @@ -10,7 +12,13 @@ __all__ = ['RWR'] -class RWR(PRM): +class RWRParams(BaseModel): + threshold: Optional[int] + alpha: Optional[float] + + model_config = ConfigDict(use_attribute_docstrings=True) + +class RWR(PRM[RWRParams]): required_inputs = ['network','nodes'] dois = [] @@ -34,11 +42,11 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(network=None, nodes=None, alpha=None, output_file=None, container_framework="docker", threshold=None): - if not nodes: + def run(inputs, args, output_file, container_framework="docker"): + if not inputs["nodes"] or not inputs["network"]: raise ValueError('Required RWR arguments are missing') - with 
Path(network).open() as network_f: + with Path(inputs["network"]).open() as network_f: for line in network_f: line = line.strip() endpoints = line.split("|") @@ -49,10 +57,10 @@ def run(network=None, nodes=None, alpha=None, output_file=None, container_framew # Each volume is a tuple (src, dest) volumes = list() - bind_path, nodes_file = prepare_volume(nodes, work_dir) + bind_path, nodes_file = prepare_volume(inputs["nodes"], work_dir) volumes.append(bind_path) - bind_path, network_file = prepare_volume(network, work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir) volumes.append(bind_path) # RWR does not provide an argument to set the output directory @@ -70,8 +78,8 @@ def run(network=None, nodes=None, alpha=None, output_file=None, container_framew '--output', mapped_out_prefix] # Add alpha as an optional argument - if alpha is not None: - command.extend(['--alpha', str(alpha)]) + if args.alpha is not None: + command.extend(['--alpha', str(args.alpha)]) container_suffix = 'rwr:v1' out = run_container(container_framework, diff --git a/spras/strwr.py b/spras/strwr.py index fc8536507..6693d7f5e 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -1,6 +1,7 @@ from pathlib import Path - import pandas as pd +from pydantic import BaseModel, ConfigDict +from typing import Optional from spras.containers import prepare_volume, run_container from spras.dataset import Dataset @@ -10,8 +11,14 @@ __all__ = ['ST_RWR'] +class ST_RWRParams(BaseModel): + threshold: Optional[int] + alpha: Optional[float] + + model_config = ConfigDict(use_attribute_docstrings=True) + # Note: This class is almost identical to the rwr.py file. -class ST_RWR(PRM): +class ST_RWR(PRM[ST_RWRParams]): required_inputs = ['network','sources','targets'] dois = [] @@ -36,11 +43,11 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(network=None, sources=None, targets=None, alpha=None, output_file=None, container_framework="docker", threshold=None): - if not sources or not targets or not network or not output_file: + def run(inputs, args, output_file, container_framework="docker"): + if not inputs["sources"] or not inputs["targets"] or not inputs["network"] or not output_file: raise ValueError('Required local_neighborhood arguments are missing') - with Path(network).open() as network_f: + with Path(inputs["network"]).open() as network_f: for line in network_f: line = line.strip() endpoints = line.split("|") @@ -52,13 +59,13 @@ def run(network=None, sources=None, targets=None, alpha=None, output_file=None, # Each volume is a tuple (src, dest) volumes = list() - bind_path, source_file = prepare_volume(sources, work_dir) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir) volumes.append(bind_path) - bind_path, target_file = prepare_volume(targets, work_dir) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir) volumes.append(bind_path) - bind_path, network_file = prepare_volume(network, work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir) volumes.append(bind_path) # ST_RWR does not provide an argument to set the output directory @@ -77,8 +84,8 @@ def run(network=None, sources=None, targets=None, alpha=None, output_file=None, '--output', mapped_out_prefix] # Add alpha as an optional argument - if alpha is not None: - command.extend(['--alpha', str(alpha)]) + if args.alpha is not None: + command.extend(['--alpha', 
str(args.alpha)]) container_suffix = 'st-rwr:v1' out = run_container(container_framework, From 94b50c81fc1da7577c4f056336cb89411676da8e Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 12:27:46 -0700 Subject: [PATCH 03/68] refactor: meo, mcf, pl types --- spras/meo.py | 32 ++++++++++++++++++++++++-------- spras/mincostflow.py | 42 +++++++++++++++++++++--------------------- spras/pathlinker.py | 24 ++++++++++++++++-------- 3 files changed, 61 insertions(+), 37 deletions(-) diff --git a/spras/meo.py b/spras/meo.py index d4d79bf9f..06f041786 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -1,5 +1,7 @@ import os from pathlib import Path +from pydantic import BaseModel, ConfigDict +from typing import Optional from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( @@ -65,6 +67,21 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None, # Do not need csp.phase, csp.gen.file, or csp.sol.file because MAXCSP is not supported +class MEOParams(BaseModel): + max_path_length: Optional[str] + "the maximal length of a path from sources and targets to orient." + + local_search: Optional[str] + """ + a "Yes"/"No" parameter that enables MEO's local search functionality. + See "Improving approximations with local search" in the associated paper + for more information. + """ + + rand_restarts: Optional[int] + "The number of random restarts to do." + + model_config = ConfigDict(use_attribute_docstrings=True) """ MEO can support partially directed graphs @@ -82,7 +99,7 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None, """ -class MEO(PRM): +class MEO(PRM[MEOParams]): required_inputs = ['sources', 'targets', 'edges'] dois = ["10.1093/nar/gkq1207"] @@ -126,8 +143,7 @@ def generate_inputs(data, filename_map): # TODO add parameter validation # TODO document required arguments @staticmethod - def run(edges=None, sources=None, targets=None, output_file=None, max_path_length=None, local_search=None, - rand_restarts=None, container_framework="docker"): + def run(inputs, args, output_file=None, container_framework="docker"): """ Run Maximum Edge Orientation in the Docker image with the provided parameters. The properties file is generated from the provided arguments. 
@@ -138,7 +154,7 @@ def run(edges=None, sources=None, targets=None, output_file=None, max_path_lengt @param output_file: the name of the output edge file, which will overwrite any existing file with this name @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) """ - if edges is None or sources is None or targets is None or output_file is None: + if inputs["edges"] is None or inputs["sources"] is None or inputs["targets"] is None: raise ValueError('Required Maximum Edge Orientation arguments are missing') work_dir = '/spras' @@ -146,13 +162,13 @@ def run(edges=None, sources=None, targets=None, output_file=None, max_path_lengt # Each volume is a tuple (src, dest) volumes = list() - bind_path, edge_file = prepare_volume(edges, work_dir) + bind_path, edge_file = prepare_volume(inputs["edges"], work_dir) volumes.append(bind_path) - bind_path, source_file = prepare_volume(sources, work_dir) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir) volumes.append(bind_path) - bind_path, target_file = prepare_volume(targets, work_dir) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir) volumes.append(bind_path) out_dir = Path(output_file).parent @@ -171,7 +187,7 @@ def run(edges=None, sources=None, targets=None, output_file=None, max_path_lengt properties_file_local = Path(out_dir, properties_file) write_properties(filename=properties_file_local, edges=edge_file, sources=source_file, targets=target_file, edge_output=mapped_output_file, path_output=mapped_path_output, - max_path_length=max_path_length, local_search=local_search, rand_restarts=rand_restarts, framework=container_framework) + max_path_length=args.max_path_length, local_search=args.local_search, rand_restarts=args.rand_restarts, framework=container_framework) bind_path, properties_file = prepare_volume(str(properties_file_local), work_dir) volumes.append(bind_path) diff --git a/spras/mincostflow.py b/spras/mincostflow.py index d2d458b02..77f493f14 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -1,4 +1,6 @@ from pathlib import Path +from pydantic import BaseModel, ConfigDict +from typing import Optional from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( @@ -10,6 +12,15 @@ __all__ = ['MinCostFlow'] +class MinCostFlowParams(BaseModel): + flow: Optional[float] + "amount of flow going through the graph" + + capacity: Optional[float] + "amount of capacity allowed on each edge" + + model_config = ConfigDict(use_attribute_docstrings=True) + """ MinCostFlow deals with fully directed graphs - OR Tools MCF is designed for directed graphs @@ -22,7 +33,7 @@ - the expected raw input file should have node pairs in the 1st and 2nd columns, with the weight in the 3rd column - it can include repeated and bidirectional edges """ -class MinCostFlow (PRM): +class MinCostFlow(PRM[MinCostFlowParams]): required_inputs = ['sources', 'targets', 'edges'] dois = ["10.1038/s41540-020-00167-1"] @@ -60,20 +71,9 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(sources=None, targets=None, edges=None, output_file=None, flow=None, capacity=None, container_framework="docker"): - """ - Run min cost flow with Docker (or singularity) - @param sources: input sources (required) - @param targets: input targets (required) - @param edges: input network file (required) - @param output_file: output file name (required) - @param flow: amount of flow going through the graph (optional) 
- @param capacity: amount of capacity allowed on each edge (optional) - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - """ - + def run(inputs, args, output_file, container_framework="docker"): # ensures that these parameters are required - if not sources or not targets or not edges or not output_file: + if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: raise ValueError('Required MinCostFlow arguments are missing') # the data files will be mapped within this directory within the container @@ -82,13 +82,13 @@ def run(sources=None, targets=None, edges=None, output_file=None, flow=None, cap # the tuple is for mapping the sources, targets, edges, and output volumes = list() - bind_path, sources_file = prepare_volume(sources, work_dir) + bind_path, sources_file = prepare_volume(inputs["sources"], work_dir) volumes.append(bind_path) - bind_path, targets_file = prepare_volume(targets, work_dir) + bind_path, targets_file = prepare_volume(inputs["targets"], work_dir) volumes.append(bind_path) - bind_path, edges_file = prepare_volume(edges, work_dir) + bind_path, edges_file = prepare_volume(inputs["edges"], work_dir) volumes.append(bind_path) # Create a prefix for the output filename and ensure the directory exists @@ -107,10 +107,10 @@ def run(sources=None, targets=None, edges=None, output_file=None, flow=None, cap '--output', mapped_out_prefix] # Optional arguments (extend the command if available) - if flow is not None: - command.extend(['--flow', str(flow)]) - if capacity is not None: - command.extend(['--capacity', str(capacity)]) + if args.flow is not None: + command.extend(['--flow', str(args.flow)]) + if args.capacity is not None: + command.extend(['--capacity', str(args.capacity)]) # choosing to run in docker or singularity container container_suffix = "mincostflow" diff --git a/spras/pathlinker.py b/spras/pathlinker.py index d0504c489..8852b959a 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -1,5 +1,7 @@ import warnings from pathlib import Path +from pydantic import BaseModel, ConfigDict +from typing import Optional from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset @@ -12,6 +14,12 @@ __all__ = ['PathLinker'] +class PathLinkerParams(BaseModel): + k: Optional[int] + "path length (optional)" + + model_config = ConfigDict(use_attribute_docstrings=True) + """ Pathlinker will construct a fully directed graph from the provided input file - an edge is represented with a head and tail node, which represents the direction of the interation between two nodes @@ -22,7 +30,7 @@ - the expected raw input file should have node pairs in the 1st and 2nd columns, with a weight in the 3rd column - it can include repeated and bidirectional edges """ -class PathLinker(PRM): +class PathLinker(PRM[PathLinkerParams]): required_inputs = ['nodetypes', 'network'] dois = ["10.1038/npjsba.2016.2", "10.1089/cmb.2012.0274"] @@ -68,20 +76,20 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(nodetypes=None, network=None, output_file=None, k=None, container_framework="docker"): + def run(inputs, args, output_file, container_framework="docker"): """ Run PathLinker with Docker @param nodetypes: input node types with sources and targets (required) @param network: input network file (required) @param output_file: path to the output pathway file (required) - @param k: path length (optional) + @param k: @param 
container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) """ # Add additional parameter validation # Do not require k # Use the PathLinker default # Could consider setting the default here instead - if not nodetypes or not network or not output_file: + if not inputs["nodetypes"] or not inputs["network"]: raise ValueError('Required PathLinker arguments are missing') work_dir = '/spras' @@ -89,10 +97,10 @@ def run(nodetypes=None, network=None, output_file=None, k=None, container_framew # Each volume is a tuple (src, dest) volumes = list() - bind_path, node_file = prepare_volume(nodetypes, work_dir) + bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir) volumes.append(bind_path) - bind_path, network_file = prepare_volume(network, work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir) volumes.append(bind_path) # PathLinker does not provide an argument to set the output directory @@ -111,8 +119,8 @@ def run(nodetypes=None, network=None, output_file=None, k=None, container_framew '--output', mapped_out_prefix] # Add optional argument - if k is not None: - command.extend(['-k', str(k)]) + if args.k is not None: + command.extend(['-k', str(args.k)]) container_suffix = "pathlinker:v2" run_container_and_log('PathLinker', From 09fa1bac0301b54842f05d284ba3b0a2390e3d11 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 12:51:33 -0700 Subject: [PATCH 04/68] chore: begin slowly updating --- spras/allpairs.py | 2 +- spras/btb.py | 2 +- spras/containers.py | 4 +- spras/domino.py | 8 +- spras/meo.py | 17 ++--- spras/mincostflow.py | 6 +- spras/omicsintegrator1.py | 2 +- spras/omicsintegrator2.py | 3 +- spras/pathlinker.py | 3 +- spras/prm.py | 3 +- spras/rwr.py | 2 +- spras/strwr.py | 2 +- test/AllPairs/test_ap.py | 65 +++++++--------- test/BowTieBuilder/test_btb.py | 133 ++++++++++++++++----------------- test/DOMINO/test_domino.py | 38 ++++------ test/MEO/test_meo.py | 30 ++++---- 16 files changed, 149 insertions(+), 171 deletions(-) diff --git a/spras/allpairs.py b/spras/allpairs.py index b1ffe2ee9..15a3b17f7 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -72,7 +72,7 @@ def generate_inputs(data: Dataset, filename_map): header=["#Interactor1", "Interactor2", "Weight"]) @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args=Empty(), container_framework="docker"): """ Run All Pairs Shortest Paths with Docker @param nodetypes: input node types with sources and targets (required) diff --git a/spras/btb.py b/spras/btb.py index a4098ee08..6ad3afb69 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -65,7 +65,7 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args=Empty(), container_framework="docker"): # Tests for pytest (docker container also runs this) # Testing out here avoids the trouble that container errors provide diff --git a/spras/containers.py b/spras/containers.py index 9a1568fdd..314d4bb45 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -369,7 +369,7 @@ def run_container_singularity(container: str, command: List[str], volumes: List[ # Because this is called independently for each file, the same local path can be mounted to multiple volumes -def prepare_volume(filename: Union[str, PurePath], volume_base: Union[str, PurePath]) -> Tuple[Tuple[PurePath, 
PurePath], str]: +def prepare_volume(filename: Union[str, os.PathLike], volume_base: Union[str, PurePath]) -> Tuple[Tuple[PurePath, PurePath], str]: """ Makes a file on the local file system accessible within a container by mapping the local (source) path to a new container (destination) path and renaming the file to be relative to the destination path. @@ -385,7 +385,7 @@ def prepare_volume(filename: Union[str, PurePath], volume_base: Union[str, PureP if not base_path.is_absolute(): raise ValueError(f'Volume base must be an absolute path: {volume_base}') - if isinstance(filename, PurePath): + if isinstance(filename, os.PathLike): filename = str(filename) filename_hash = hash_filename(filename, config.config.hash_length) diff --git a/spras/domino.py b/spras/domino.py index a70f1a1e3..187e53836 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -13,16 +13,16 @@ from spras.prm import PRM from spras.util import duplicate_edges -__all__ = ['DOMINO', 'pre_domino_id_transform', 'post_domino_id_transform'] +__all__ = ['DOMINO', 'DominoParams', 'pre_domino_id_transform', 'post_domino_id_transform'] ID_PREFIX = 'ENSG0' ID_PREFIX_LEN = len(ID_PREFIX) class DominoParams(BaseModel): - module_threshold: Optional[float] + module_threshold: Optional[float] = None "the p-value threshold for considering a slice as relevant (optional)" - slice_threshold: Optional[float] + slice_threshold: Optional[float] = None "the p-value threshold for considering a putative module as final module (optional)" model_config = ConfigDict(use_attribute_docstrings=True) @@ -76,7 +76,7 @@ def generate_inputs(data, filename_map): header=['ID_interactor_A', 'ppi', 'ID_interactor_B']) @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args=DominoParams(), container_framework="docker"): # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. if not inputs["network"] or not inputs["active_genes"]: raise ValueError('Required DOMINO arguments are missing') diff --git a/spras/meo.py b/spras/meo.py index 06f041786..0451cb4c0 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -11,7 +11,7 @@ from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['MEO', 'write_properties'] +__all__ = ['MEO', 'MEOParams', 'write_properties'] # replaces all underscores in the node names with unicode seperator # MEO keeps only the substring up to the first underscore when parsing node names @@ -58,7 +58,8 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None, if max_path_length is not None: f.write(f'max.path.length = {max_path_length}\n') if local_search is not None: - f.write(f'local.search = {local_search}\n') + # Yes/No for this parameter. + f.write(f'local.search = {"Yes" if local_search else "No"}\n') if rand_restarts is not None: f.write(f'rand.restarts = {rand_restarts}\n') @@ -68,17 +69,17 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None, # Do not need csp.phase, csp.gen.file, or csp.sol.file because MAXCSP is not supported class MEOParams(BaseModel): - max_path_length: Optional[str] + max_path_length: Optional[int] = None "the maximal length of a path from sources and targets to orient." - local_search: Optional[str] + local_search: Optional[bool] = None """ - a "Yes"/"No" parameter that enables MEO's local search functionality. + a boolean parameter that enables MEO's local search functionality. 
See "Improving approximations with local search" in the associated paper for more information. """ - rand_restarts: Optional[int] + rand_restarts: Optional[int] = None "The number of random restarts to do." model_config = ConfigDict(use_attribute_docstrings=True) @@ -143,7 +144,7 @@ def generate_inputs(data, filename_map): # TODO add parameter validation # TODO document required arguments @staticmethod - def run(inputs, args, output_file=None, container_framework="docker"): + def run(inputs, args=MEOParams(), output_file=None, container_framework="docker"): """ Run Maximum Edge Orientation in the Docker image with the provided parameters. The properties file is generated from the provided arguments. @@ -151,8 +152,6 @@ def run(inputs, args, output_file=None, container_framework="docker"): Does not support MINSAT or MAXCSP. Only the edge output file is retained. All other output files are deleted. - @param output_file: the name of the output edge file, which will overwrite any existing file with this name - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) """ if inputs["edges"] is None or inputs["sources"] is None or inputs["targets"] is None: raise ValueError('Required Maximum Edge Orientation arguments are missing') diff --git a/spras/mincostflow.py b/spras/mincostflow.py index 77f493f14..986c1c8eb 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -13,10 +13,10 @@ __all__ = ['MinCostFlow'] class MinCostFlowParams(BaseModel): - flow: Optional[float] + flow: Optional[float] = None "amount of flow going through the graph" - capacity: Optional[float] + capacity: Optional[float] = None "amount of capacity allowed on each edge" model_config = ConfigDict(use_attribute_docstrings=True) @@ -71,7 +71,7 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args=MinCostFlowParams(), container_framework="docker"): # ensures that these parameters are required if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: raise ValueError('Required MinCostFlow arguments are missing') diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 7a69a01d6..3361f5d2a 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -141,7 +141,7 @@ def generate_inputs(data, filename_map): # TODO add reasonable default values # TODO document required arguments @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args, container_framework="docker"): if inputs["edges"] is None or inputs["prizes"] is None or output_file is None or w is None or b is None or d is None: raise ValueError('Required Omics Integrator 1 arguments are missing') diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 42dc466cd..20351833e 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -98,11 +98,10 @@ def generate_inputs(data: Dataset, filename_map): edges_df.to_csv(filename_map['edges'], sep='\t', index=False, columns=['Interactor1', 'Interactor2', 'cost'], header=['protein1', 'protein2', 'cost']) - # TODO add parameter validation # TODO add reasonable default values # TODO document required arguments @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args, container_framework="docker"): """ Run Omics Integrator 2 in the Docker image with 
the provided parameters. Only the .tsv output file is retained and then renamed. diff --git a/spras/pathlinker.py b/spras/pathlinker.py index 8852b959a..3c78ffb84 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -74,9 +74,8 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map["network"],sep="\t",index=False,columns=["Interactor1","Interactor2","Weight"], header=["#Interactor1","Interactor2","Weight"]) - # Skips parameter validation step @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args, container_framework="docker"): """ Run PathLinker with Docker @param nodetypes: input node types with sources and targets (required) diff --git a/spras/prm.py b/spras/prm.py index 06d005b2a..1692f11f6 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from pydantic import BaseModel from typing import Any, cast, TypeVar, Generic +import os from spras.dataset import Dataset @@ -40,7 +41,7 @@ def generate_inputs(data: Dataset, filename_map: dict[str, str]): @staticmethod @abstractmethod - def run(inputs: dict[str, str], args: T, output_file: str, container_framework="docker"): + def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: T, container_framework="docker"): """ Runs an algorithm with the specified inputs, algorithm params (T), the designated output_file, and the desired container_framework. diff --git a/spras/rwr.py b/spras/rwr.py index 12fc5d422..12df71e01 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -42,7 +42,7 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args, container_framework="docker"): if not inputs["nodes"] or not inputs["network"]: raise ValueError('Required RWR arguments are missing') diff --git a/spras/strwr.py b/spras/strwr.py index 6693d7f5e..c603f9196 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -43,7 +43,7 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(inputs, args, output_file, container_framework="docker"): + def run(inputs, output_file, args, container_framework="docker"): if not inputs["sources"] or not inputs["targets"] or not inputs["network"] or not output_file: raise ValueError('Required local_neighborhood arguments are missing') diff --git a/test/AllPairs/test_ap.py b/test/AllPairs/test_ap.py index 8d094561f..ee76d0ce7 100644 --- a/test/AllPairs/test_ap.py +++ b/test/AllPairs/test_ap.py @@ -45,11 +45,10 @@ def test_allpairs(self): out_path = OUT_DIR.joinpath('sample-out.txt') out_path.unlink(missing_ok=True) # Only include required arguments - AllPairs.run( - nodetypes=str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'), - network=str(TEST_DIR / 'input' / 'sample-in-net.txt'), - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'), - output_file=str(out_path) + AllPairs.run({"nodetypes": str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'), + "network": str(TEST_DIR / 'input' / 'sample-in-net.txt'), + "directed_flag": str(TEST_DIR / 'input' / 'directed-flag-false.txt')}, + output_file=str(out_path) ) assert out_path.exists() @@ -57,9 +56,8 @@ def test_allpairs_missing(self): # Test the expected error is raised when required arguments are 
missing with pytest.raises(ValueError): # No nodetypes - AllPairs.run( - network=str(TEST_DIR / 'input' / 'sample-in-net.txt'), - output_file=str(OUT_DIR / 'sample-out.txt')) + AllPairs.run({"network": str(TEST_DIR / 'input' / 'sample-in-net.txt')}, + output_file=str(OUT_DIR / 'sample-out.txt')) # Only run Singularity test if the binary is available on the system # spython is only available on Unix, but do not explicitly skip non-Unix platforms @@ -68,12 +66,11 @@ def test_allpairs_singularity(self): out_path = OUT_DIR / 'sample-out.txt' out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - AllPairs.run( - nodetypes=str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'), - network=str(TEST_DIR / 'input' / 'sample-in-net.txt'), - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'), - output_file=str(out_path), - container_framework="singularity") + AllPairs.run({"nodetypes": str(TEST_DIR / 'input' / 'sample-in-nodetypes.txt'), + "network": str(TEST_DIR / 'input' / 'sample-in-net.txt'), + "directed_flag": str(TEST_DIR / 'input' / 'directed-flag-false.txt')}, + output_file=str(out_path), + container_framework="singularity") assert out_path.exists() @pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system') @@ -82,12 +79,11 @@ def test_allpairs_singularity_unpacked(self): out_path.unlink(missing_ok=True) # Indicate via config mechanism that we want to unpack the Singularity container config.config.unpack_singularity = True - AllPairs.run( - nodetypes=str(TEST_DIR / 'input/sample-in-nodetypes.txt'), - network=str(TEST_DIR / 'input/sample-in-net.txt'), - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'), - output_file=str(out_path), - container_framework="singularity") + AllPairs.run({"nodetypes": str(TEST_DIR / 'input/sample-in-nodetypes.txt'), + "network": str(TEST_DIR / 'input/sample-in-net.txt'), + "directed_flag": str(TEST_DIR / 'input' / 'directed-flag-false.txt')}, + output_file=str(out_path), + container_framework="singularity") config.config.unpack_singularity = False assert out_path.exists() @@ -104,12 +100,10 @@ def test_allpairs_correctness(self): out_path = OUT_DIR / 'correctness-out.txt' out_path.unlink(missing_ok=True) - AllPairs.run( - nodetypes=str(TEST_DIR / 'input' / 'correctness-nodetypes.txt'), - network=str(TEST_DIR / 'input' / 'correctness-network.txt'), - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'), - output_file=str(OUT_DIR / 'correctness-out.txt') - ) + AllPairs.run({"nodetypes": TEST_DIR / 'input' / 'correctness-nodetypes.txt', + "network": TEST_DIR / 'input' / 'correctness-network.txt', + "directed_flag": TEST_DIR / 'input' / 'directed-flag-false.txt'}, + output_file=OUT_DIR / 'correctness-out.txt') edge_equality_test_util(out_path, EXPECTED_DIR / 'correctness-expected.txt') @@ -117,12 +111,10 @@ def test_allpairs_directed(self): out_path = OUT_DIR / 'directed-out.txt' out_path.unlink(missing_ok=True) - AllPairs.run( - nodetypes=str(TEST_DIR / 'input' / 'directed-nodetypes.txt'), - network=str(TEST_DIR / 'input' / 'directed-network.txt'), - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-true.txt'), - output_file=str(OUT_DIR / 'directed-out.txt'), - ) + AllPairs.run({"nodetypes": TEST_DIR / 'input' / 'directed-nodetypes.txt', + "network": TEST_DIR / 'input' / 'directed-network.txt', + "directed_flag": TEST_DIR / 'input' / 'directed-flag-true.txt'}, + output_file=OUT_DIR / 'directed-out.txt') edge_equality_test_util(out_path, 
EXPECTED_DIR.joinpath('directed-expected.txt')) @@ -136,11 +128,10 @@ def test_allpairs_zero_length(self): out_path = OUT_DIR / 'zero-length-out.txt' out_path.unlink(missing_ok=True) - AllPairs.run( - nodetypes=TEST_DIR / 'input' / 'zero-length-nodetypes.txt', - network=TEST_DIR / 'input' / 'zero-length-network.txt', - directed_flag=str(TEST_DIR / 'input' / 'directed-flag-false.txt'), - output_file=OUT_DIR / 'zero-length-out.txt' + AllPairs.run({"nodetypes": TEST_DIR / 'input' / 'zero-length-nodetypes.txt', + "network": TEST_DIR / 'input' / 'zero-length-network.txt', + "directed_flag": TEST_DIR / 'input' / 'directed-flag-false.txt'}, + output_file=OUT_DIR / 'zero-length-out.txt' ) assert filecmp.cmp(OUT_DIR / 'zero-length-out.txt', EXPECTED_DIR / 'zero-length-expected.txt', shallow=False) diff --git a/test/BowTieBuilder/test_btb.py b/test/BowTieBuilder/test_btb.py index d4a458b3c..c65ce4a32 100644 --- a/test/BowTieBuilder/test_btb.py +++ b/test/BowTieBuilder/test_btb.py @@ -25,22 +25,19 @@ class TestBowTieBuilder: def test_btb_missing(self): with pytest.raises(ValueError): # No edges - BTB.run( - targets=Path(TEST_DIR, 'input', 'target.txt'), - sources=Path(TEST_DIR, 'input', 'source.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"targets": Path(TEST_DIR, 'input', 'target.txt'), + "sources": Path(TEST_DIR, 'input', 'source.txt')}, + output_file=OUT_FILE_DEFAULT) with pytest.raises(ValueError): # No source - BTB.run( - targets=Path(TEST_DIR, 'input', 'target.txt'), - edges=Path(TEST_DIR, 'input', 'edges.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"targets": Path(TEST_DIR, 'input', 'target.txt'), + "edges": Path(TEST_DIR, 'input', 'edges.txt')}, + output_file=OUT_FILE_DEFAULT) with pytest.raises(ValueError): # No target - BTB.run( - sources=Path(TEST_DIR, 'input', 'source.txt'), - edges=Path(TEST_DIR, 'input', 'edges.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"sources": Path(TEST_DIR, 'input', 'source.txt'), + "edges": Path(TEST_DIR, 'input', 'edges.txt')}, + output_file=OUT_FILE_DEFAULT) """ @@ -48,30 +45,30 @@ def test_btb_missing(self): """ def test_btb_file(self): with pytest.raises(ValueError): - BTB.run(sources=Path(TEST_DIR, 'input', 'unknown.txt'), - targets=Path(TEST_DIR, 'input', 'target.txt'), - edges=Path(TEST_DIR, 'input', 'edges.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"sources": Path(TEST_DIR, 'input', 'unknown.txt'), + "targets": Path(TEST_DIR, 'input', 'target.txt'), + "edges": Path(TEST_DIR, 'input', 'edges.txt')}, + output_file=OUT_FILE_DEFAULT) """ Run the BowTieBuilder algorithm with bad input data """ def test_format_error(self): with pytest.raises(IndexError): - BTB.run(sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - edges=Path(TEST_DIR, 'input', 'bad-edges.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt'), + "edges": Path(TEST_DIR, 'input', 'bad-edges.txt')}, + output_file=OUT_FILE_DEFAULT) """ Run the BowTieBuilder algorithm on the example input files and check the output matches the expected output """ def test_btb(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'btb-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'btb-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), 
+ "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'btb-output.txt') @@ -89,10 +86,10 @@ def test_btb(self): """ def test_disjoint(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'disjoint-edges.txt'), - sources=Path(TEST_DIR, 'input', 'disjoint-sources.txt'), - targets=Path(TEST_DIR, 'input', 'disjoint-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'disjoint-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'disjoint-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'disjoint-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'disjoint-output.txt') @@ -110,10 +107,10 @@ def test_disjoint(self): """ def test_disjoint2(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'disjoint2-edges.txt'), - sources=Path(TEST_DIR, 'input', 'disjoint-sources.txt'), - targets=Path(TEST_DIR, 'input', 'disjoint-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'disjoint2-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'disjoint-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'disjoint-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'disjoint-output.txt') @@ -132,10 +129,10 @@ def test_disjoint2(self): def test_missing_file(self): with pytest.raises(ValueError): with pytest.raises(OSError): - BTB.run(edges=Path(TEST_DIR, 'input', 'missing.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'missing.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) """ @@ -143,10 +140,10 @@ def test_missing_file(self): """ def test_source_to_source(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'source-to-source-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'source-to-source-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'source-to-source-output.txt') @@ -164,10 +161,10 @@ def test_source_to_source(self): """ def test_source_to_source2(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'source-to-source2-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'source-to-source2-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'source-to-source2-output.txt') @@ -186,10 +183,10 @@ 
def test_source_to_source2(self): def test_source_to_source_disjoint(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'source-to-source-disjoint-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'source-to-source-disjoint-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'source-to-source-disjoint-output.txt') @@ -208,10 +205,10 @@ def test_source_to_source_disjoint(self): def test_bidirectional(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'bidirectional-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'bidirectional-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'bidirectional-output.txt') @@ -230,10 +227,10 @@ def test_bidirectional(self): def test_target_to_source(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'target-to-source-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'target-to-source-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'empty-output.txt') @@ -252,10 +249,10 @@ def test_target_to_source(self): def test_loop(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'loop-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'loop-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'loop-output.txt') @@ -274,10 +271,10 @@ def test_loop(self): def test_weighted(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 'input', 'weighted-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'weighted-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'weighted-output.txt') @@ -292,10 +289,10 @@ def test_weighted(self): def test_weight_one(self): OUT_FILE_DEFAULT.unlink(missing_ok=True) - BTB.run(edges=Path(TEST_DIR, 
'input', 'weight-one-edges.txt'), - sources=Path(TEST_DIR, 'input', 'btb-sources.txt'), - targets=Path(TEST_DIR, 'input', 'btb-targets.txt'), - output_file=OUT_FILE_DEFAULT) + BTB.run({"edges": Path(TEST_DIR, 'input', 'weight-one-edges.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) assert OUT_FILE_DEFAULT.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected', 'weighted-output.txt') diff --git a/test/DOMINO/test_domino.py b/test/DOMINO/test_domino.py index 4323ea4c9..62563bdc3 100644 --- a/test/DOMINO/test_domino.py +++ b/test/DOMINO/test_domino.py @@ -5,7 +5,7 @@ import pytest import spras.config.config as config -from spras.domino import DOMINO, post_domino_id_transform, pre_domino_id_transform +from spras.domino import DOMINO, DominoParams, post_domino_id_transform, pre_domino_id_transform config.init_from_file("config/config.yaml") @@ -28,10 +28,9 @@ def test_domino_required(self): # Only include required arguments out_path = Path(OUT_FILE_DEFAULT) out_path.unlink(missing_ok=True) - DOMINO.run( - network=TEST_DIR+'input/domino-network.txt', - active_genes=TEST_DIR+'input/domino-active-genes.txt', - output_file=OUT_FILE_DEFAULT) + DOMINO.run({"network": TEST_DIR+'input/domino-network.txt', + "active_genes": TEST_DIR+'input/domino-active-genes.txt'}, + output_file=OUT_FILE_DEFAULT) # output_file should be empty assert out_path.exists() @@ -39,12 +38,10 @@ def test_domino_optional(self): # Include optional arguments out_path = Path(OUT_FILE_OPTIONAL) out_path.unlink(missing_ok=True) - DOMINO.run( - network=TEST_DIR+'input/domino-network.txt', - active_genes=TEST_DIR+'input/domino-active-genes.txt', - output_file=OUT_FILE_OPTIONAL, - slice_threshold=0.4, - module_threshold=0.06) + DOMINO.run({"network": TEST_DIR+'input/domino-network.txt', + "active_genes": TEST_DIR+'input/domino-active-genes.txt'}, + output_file=OUT_FILE_OPTIONAL, + args=DominoParams(slice_threshold=0.4, module_threshold=0.06)) # output_file should be empty assert out_path.exists() @@ -52,17 +49,15 @@ def test_domino_missing_active_genes(self): # Test the expected error is raised when active_genes argument is missing with pytest.raises(ValueError): # No active_genes - DOMINO.run( - network=TEST_DIR+'input/domino-network.txt', - output_file=OUT_FILE_DEFAULT) + DOMINO.run({"network": TEST_DIR+'input/domino-network.txt'}, + output_file=OUT_FILE_DEFAULT) def test_domino_missing_network(self): # Test the expected error is raised when network argument is missing with pytest.raises(ValueError): # No network - DOMINO.run( - active_genes=TEST_DIR+'input/domino-active-genes.txt', - output_file=OUT_FILE_DEFAULT) + DOMINO.run({"active_genes": TEST_DIR+'input/domino-active-genes.txt'}, + output_file=OUT_FILE_DEFAULT) # Only run Singularity test if the binary is available on the system # spython is only available on Unix, but do not explicitly skip non-Unix platforms @@ -71,11 +66,10 @@ def test_domino_singularity(self): out_path = Path(OUT_FILE_DEFAULT) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - DOMINO.run( - network=TEST_DIR+'input/domino-network.txt', - active_genes=TEST_DIR+'input/domino-active-genes.txt', - output_file=OUT_FILE_DEFAULT, - container_framework="singularity") + DOMINO.run({"network": TEST_DIR+'input/domino-network.txt', + "active_genes": TEST_DIR+'input/domino-active-genes.txt'}, + output_file=OUT_FILE_DEFAULT, + 
container_framework="singularity") assert out_path.exists() def test_pre_id_transform(self): diff --git a/test/MEO/test_meo.py b/test/MEO/test_meo.py index 32958be20..051744ed7 100644 --- a/test/MEO/test_meo.py +++ b/test/MEO/test_meo.py @@ -4,7 +4,7 @@ import pytest import spras.config.config as config -from spras.meo import MEO, write_properties +from spras.meo import MEO, MEOParams, write_properties config.init_from_file("config/config.yaml") @@ -20,9 +20,9 @@ def test_meo_required(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Only include required arguments - MEO.run(edges=TEST_DIR + 'input/meo-edges.txt', - sources=TEST_DIR + 'input/meo-sources.txt', - targets=TEST_DIR + 'input/meo-targets.txt', + MEO.run({"edges": TEST_DIR + 'input/meo-edges.txt', + "sources": TEST_DIR + 'input/meo-sources.txt', + "targets": TEST_DIR + 'input/meo-targets.txt'}, output_file=OUT_FILE) assert out_path.exists() @@ -30,21 +30,19 @@ def test_meo_all_optional(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include all optional arguments - MEO.run(edges=TEST_DIR + 'input/meo-edges.txt', - sources=TEST_DIR + 'input/meo-sources.txt', - targets=TEST_DIR + 'input/meo-targets.txt', - output_file=OUT_FILE, - max_path_length=3, - local_search='No', - rand_restarts=10) + MEO.run({"edges": TEST_DIR + 'input/meo-edges.txt', + "sources": TEST_DIR + 'input/meo-sources.txt', + "targets": TEST_DIR + 'input/meo-targets.txt'}, + args=MEOParams(max_path_length=3, local_search=False, rand_restarts=10), + output_file=OUT_FILE) assert out_path.exists() def test_meo_missing(self): # Test the expected error is raised when required arguments are missing with pytest.raises(ValueError): # No edges - MEO.run(sources=TEST_DIR + 'input/meo-sources.txt', - targets=TEST_DIR + 'input/meo-targets.txt', + MEO.run({"sources": TEST_DIR + 'input/meo-sources.txt', + "targets": TEST_DIR + 'input/meo-targets.txt'}, output_file=OUT_FILE) with pytest.raises(ValueError): @@ -62,9 +60,9 @@ def test_meo_singularity(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - MEO.run(edges=TEST_DIR + 'input/meo-edges.txt', - sources=TEST_DIR + 'input/meo-sources.txt', - targets=TEST_DIR + 'input/meo-targets.txt', + MEO.run({"edges": TEST_DIR + 'input/meo-edges.txt', + "sources": TEST_DIR + 'input/meo-sources.txt', + "targets": TEST_DIR + 'input/meo-targets.txt'}, output_file=OUT_FILE, container_framework="singularity") assert out_path.exists() From 32d4b5cbce1e46a0afe5744b778350ab2b7cbae8 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Mon, 14 Jul 2025 20:03:27 +0000
Subject: [PATCH 05/68] refactor: moving more tests

---
 spras/omicsintegrator1.py         |  34 +++++-----
 spras/omicsintegrator2.py         |  16 +++--
 test/OmicsIntegrator1/test_oi1.py | 109 +++++++++++++++---------------
 test/OmicsIntegrator2/test_oi2.py |  43 +++++-------
 4 files changed, 97 insertions(+), 105 deletions(-)

diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py
index 3361f5d2a..45465ecd6 100644
--- a/spras/omicsintegrator1.py
+++ b/spras/omicsintegrator1.py
@@ -7,7 +7,7 @@ from spras.prm import PRM
 from spras.util import add_rank_column, duplicate_edges, raw_pathway_df

-__all__ = ['OmicsIntegrator1', 'write_conf']
+__all__ = ['OmicsIntegrator1', 'OmicsIntegrator1Params', 'write_conf']

 # TODO decide on default number of processes and threads
@@ -38,41 +38,41 @@ def write_conf(filename=Path('config.txt'), w=None, b=None, d=None, mu=None, noi
     f.write('threads = 1\n')

 class OmicsIntegrator1Params(BaseModel):
-    dummy_mode: Optional[str]
-    mu_squared: Optional[str]
-    exclude_terms: Optional[str]
+    dummy_mode: Optional[str] = None
+    mu_squared: Optional[bool] = None
+    exclude_terms: Optional[bool] = None

-    noisy_edges: Optional[str]
+    noisy_edges: Optional[int] = None
     "How many times you would like to add noise to the given edge values and re-run the algorithm."

-    shuffled_prizes: Optional[int]
+    shuffled_prizes: Optional[int] = None
     "shuffled_prizes: How many times the algorithm should shuffle the prizes and re-run"

-    random_terminals: Optional[int]
+    random_terminals: Optional[int] = None
     "How many times to apply the given prizes to random nodes in the interactome"

-    seed: Optional[str]
+    seed: Optional[int] = None
     "the randomness seed to use"

-    w: Optional[float]
+    w: int
     "the number of trees"

-    b: Optional[str]
+    b: float
     "the trade-off between including more terminals and using less reliable edges"

-    d: Optional[str]
+    d: int
     "controls the maximum path-length from v0 to terminal nodes"

-    mu: Optional[float]
+    mu: Optional[float] = None
     "controls the degree-based negative prizes (default 0.0)"

-    noise: Optional[str]
+    noise: Optional[float] = None
     "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations"

-    g: Optional[str]
+    g: Optional[float] = None
     "(Gamma) multiplicative edge penalty from degree of endpoints"

-    r: Optional[str]
+    r: Optional[float] = None
     "msgsteiner parameter that adds random noise to edges, which is rarely needed because the Forest --noisyEdges option is recommended instead (default 0)"

     model_config = ConfigDict(use_attribute_docstrings=True)
@@ -142,7 +142,7 @@ def generate_inputs(data, filename_map):
     # TODO document required arguments
     @staticmethod
     def run(inputs, output_file, args, container_framework="docker"):
-        if inputs["edges"] is None or inputs["prizes"] is None or output_file is None or w is None or b is None or d is None:
+        if inputs["edges"] is None or inputs["prizes"] is None or output_file is None:
             raise ValueError('Required Omics Integrator 1 arguments are missing')

         work_dir = '/spras'
@@ -195,7 +195,7 @@ def run(inputs, output_file, args, container_framework="docker"):
         if args.dummy_mode is not None and args.dummy_mode:
             # for custom dummy modes, add the file
             if args.dummy_mode == 'file':
-                command.extend(['--dummyMode', inputs["dummy_file"]])
+                command.extend(['--dummyMode', str(inputs["dummy_file"])])
             # else pass in the dummy_mode and let oi1 handle it
             else:
                 command.extend(['--dummyMode', args.dummy_mode])
diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py
index 20351833e..944bf1bf7 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -10,7 +10,7 @@ from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges -__all__ = ['OmicsIntegrator2'] +__all__ = ['OmicsIntegrator2', 'OmicsIntegrator2Params'] class OmicsIntegrator2Params(BaseModel): w: float = 6 @@ -22,16 +22,16 @@ class OmicsIntegrator2Params(BaseModel): g: float = 20 "Gamma: multiplicative edge penalty from degree of endpoints" - noise: Optional[str] + noise: Optional[float] = None "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations." - noisy_edges: Optional[int] + noisy_edges: Optional[int] = None "An integer specifying how many times to add noise to the given edge values and re-run." - random_terminals: Optional[str] + random_terminals: Optional[int] = None "An integer specifying how many times to apply your given prizes to random nodes in the interactome and re-run" - dummy_mode: Optional[str] + dummy_mode: Optional[str] = None """ Tells the program which nodes in the interactome to connect the dummy node to. (default: terminals) "terminals" = connect to all terminals @@ -39,9 +39,11 @@ class OmicsIntegrator2Params(BaseModel): "all" = connect to all nodes in the interactome. """ - seed: Optional[str] + seed: Optional[int] = None "The random seed to use for this run." + model_config = ConfigDict(use_attribute_docstrings=True) + """ Omics Integrator 2 will construct a fully undirected graph from the provided input file - in the algorithm, it uses nx.Graph() objects, which are undirected @@ -101,7 +103,7 @@ def generate_inputs(data: Dataset, filename_map): # TODO add reasonable default values # TODO document required arguments @staticmethod - def run(inputs, output_file, args, container_framework="docker"): + def run(inputs, output_file, args=OmicsIntegrator2Params(), container_framework="docker"): """ Run Omics Integrator 2 in the Docker image with the provided parameters. Only the .tsv output file is retained and then renamed. 
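Under this design, a caller supplies the file inputs as a dict and the tunables as a typed args model. A hypothetical invocation of the refactored interface (the paths here are illustrative and mirror the converted tests later in this series):

    OmicsIntegrator2.run({"edges": "input/oi2-edges.txt",
                          "prizes": "input/oi2-prizes.txt"},
                         output_file="output/pathway.tsv",
                         args=OmicsIntegrator2Params(b=1, g=3))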
diff --git a/test/OmicsIntegrator1/test_oi1.py b/test/OmicsIntegrator1/test_oi1.py index a484c0af3..fad4627e0 100644 --- a/test/OmicsIntegrator1/test_oi1.py +++ b/test/OmicsIntegrator1/test_oi1.py @@ -4,7 +4,7 @@ import pytest import spras.config.config as config -from spras.omicsintegrator1 import OmicsIntegrator1, write_conf +from spras.omicsintegrator1 import OmicsIntegrator1, OmicsIntegrator1Params, write_conf config.init_from_file("config/config.yaml") @@ -20,79 +20,74 @@ def test_oi1_required(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Only include required arguments - OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt', - prizes=TEST_DIR+'input/oi1-prizes.txt', + OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt', + "prizes": TEST_DIR+'input/oi1-prizes.txt'}, output_file=OUT_FILE, - w=5, - b=1, - d=10) + args=OmicsIntegrator1Params(w=5, b=1, d=10)) assert out_path.exists() def test_oi1_some_optional(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include optional argument - OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt', - prizes=TEST_DIR+'input/oi1-prizes.txt', + OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt', + "prizes": TEST_DIR+'input/oi1-prizes.txt'}, output_file=OUT_FILE, - w=5, - b=1, - d=10, - noise=0.333, - g=0.001, - r=0) + args=OmicsIntegrator1Params(w=5, b=1, d=10, noise=0.333, g=0.001, r=0)) assert out_path.exists() def test_oi1_all_optional(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include all optional arguments - OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt', - prizes=TEST_DIR+'input/oi1-prizes.txt', - dummy_nodes=None, - dummy_mode='terminals', - mu_squared=True, - exclude_terms=True, + OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt', + "prizes": TEST_DIR+'input/oi1-prizes.txt'}, output_file=OUT_FILE, - noisy_edges=0, - shuffled_prizes=0, - random_terminals=0, - seed=1, - w=5, - b=1, - d=10, - mu=0, - noise=0.333, - g=0.001, - r=0) + args=OmicsIntegrator1Params( + dummy_mode='terminals', + mu_squared=True, + exclude_terms=True, + noisy_edges=0, + shuffled_prizes=0, + random_terminals=0, + seed=1, + w=5, + b=1, + d=10, + mu=0, + noise=0.333, + g=0.001, + r=0)) assert out_path.exists() def test_oi1_dummy_file(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include optional argument - OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt', - prizes=TEST_DIR+'input/oi1-prizes.txt', - dummy_nodes=TEST_DIR + 'input/oi1-dummy.txt', - dummy_mode='file', + OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt', + "prizes": TEST_DIR+'input/oi1-prizes.txt', + "dummy_nodes": TEST_DIR + 'input/oi1-dummy.txt'}, output_file=OUT_FILE, - w=5, - b=1, - d=10, - noise=0.333, - g=0.001, - r=0) + args=OmicsIntegrator1Params( + dummy_mode='file', + w=5, + b=1, + d=10, + noise=0.333, + g=0.001, + r=0)) assert out_path.exists() def test_oi1_missing(self): # Test the expected error is raised when required arguments are missing with pytest.raises(ValueError): # No edges - OmicsIntegrator1.run(prizes=TEST_DIR + 'input/oi1-prizes.txt', + OmicsIntegrator1.run({"prizes": TEST_DIR + 'input/oi1-prizes.txt'}, output_file=TEST_DIR+'output/test_optimalForest.sif', - w=5, - b=1, - d=10) + args=OmicsIntegrator1Params( + w=5, + b=1, + d=10)) with pytest.raises(ValueError): # No w write_conf(Path('.'), @@ -103,13 +98,14 @@ def test_oi1_missing_dummy(self): # Test the expected error is raised when the dummy_nodes file is missing and the 
dummy_mode is 'file' with pytest.raises(ValueError): # No edges - OmicsIntegrator1.run(edges=TEST_DIR+'input/oi1-edges.txt', - prizes=TEST_DIR + 'input/oi1-prizes.txt', + OmicsIntegrator1.run({"edges": TEST_DIR+'input/oi1-edges.txt', + "prizes": TEST_DIR + 'input/oi1-prizes.txt'}, output_file=TEST_DIR+'output/test_optimalForest.sif', - w=5, - b=1, - d=10, - dummy_mode='file') + args=OmicsIntegrator1Params( + w=5, + b=1, + d=10, + dummy_mode='file')) # Only run Singularity test if the binary is available on the system # spython is only available on Unix, but do not explicitly skip non-Unix platforms @@ -118,11 +114,12 @@ def test_oi1_singularity(self): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - OmicsIntegrator1.run(edges=TEST_DIR + 'input/oi1-edges.txt', - prizes=TEST_DIR + 'input/oi1-prizes.txt', + OmicsIntegrator1.run({"edges": TEST_DIR + 'input/oi1-edges.txt', + "prizes": TEST_DIR + 'input/oi1-prizes.txt'}, output_file=OUT_FILE, - w=5, - b=1, - d=10, + args=OmicsIntegrator1Params( + w=5, + b=1, + d=10), container_framework="singularity") assert out_path.exists() diff --git a/test/OmicsIntegrator2/test_oi2.py b/test/OmicsIntegrator2/test_oi2.py index 13f7f30b6..0239d5e5f 100644 --- a/test/OmicsIntegrator2/test_oi2.py +++ b/test/OmicsIntegrator2/test_oi2.py @@ -4,7 +4,7 @@ import pytest import spras.config.config as config -from spras.omicsintegrator2 import OmicsIntegrator2 +from spras.omicsintegrator2 import OmicsIntegrator2, OmicsIntegrator2Params config.init_from_file("config/config.yaml") @@ -21,51 +21,44 @@ class TestOmicsIntegrator2: def test_oi2_required(self): # Only include required arguments OUT_FILE.unlink(missing_ok=True) - OmicsIntegrator2.run(edges=EDGE_FILE, - prizes=PRIZE_FILE, + OmicsIntegrator2.run({"edges": EDGE_FILE, + "prizes": PRIZE_FILE}, output_file=OUT_FILE) assert OUT_FILE.exists() def test_oi2_some_optional(self): # Include optional argument OUT_FILE.unlink(missing_ok=True) - OmicsIntegrator2.run(edges=EDGE_FILE, - prizes=PRIZE_FILE, + OmicsIntegrator2.run({"edges": EDGE_FILE, + "prizes": PRIZE_FILE}, output_file=OUT_FILE, - g=0) + args=OmicsIntegrator2Params(g=0)) assert OUT_FILE.exists() def test_oi2_all_optional(self): # Include all optional arguments OUT_FILE.unlink(missing_ok=True) - OmicsIntegrator2.run(edges=EDGE_FILE, - prizes=PRIZE_FILE, + OmicsIntegrator2.run({"edges": EDGE_FILE, + "prizes": PRIZE_FILE}, output_file=OUT_FILE, - w=5, - b=1, - g=3, - noise=0.1, - noisy_edges=0, - random_terminals=0, - dummy_mode='terminals', - seed=2) + args=OmicsIntegrator2Params(w=5, + b=1, + g=3, + noise=0.1, + noisy_edges=0, + random_terminals=0, + dummy_mode='terminals', + seed=2)) assert OUT_FILE.exists() - def test_oi2_missing(self): - # Test the expected error is raised when required arguments are missing - with pytest.raises(ValueError): - # No output_file - OmicsIntegrator2.run(edges=EDGE_FILE, - prizes=PRIZE_FILE) - # Only run Singularity test if the binary is available on the system # spython is only available on Unix, but do not explicitly skip non-Unix platforms @pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system') def test_oi2_singularity(self): # Only include required arguments OUT_FILE.unlink(missing_ok=True) - OmicsIntegrator2.run(edges=EDGE_FILE, - prizes=PRIZE_FILE, + OmicsIntegrator2.run({"edges": EDGE_FILE, + "prizes": PRIZE_FILE}, output_file=OUT_FILE, container_framework="singularity") assert OUT_FILE.exists() From 
9b539e99fe7f4f53549e4e295fe0d3bf6bce39ed Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 20:28:50 +0000 Subject: [PATCH 06/68] fix: correct params --- spras/config/util.py | 3 +- spras/domino.py | 2 +- spras/meo.py | 5 ++- spras/mincostflow.py | 5 ++- spras/omicsintegrator1.py | 3 +- spras/omicsintegrator2.py | 2 +- spras/pathlinker.py | 9 +++-- spras/prm.py | 5 ++- spras/rwr.py | 11 +++-- spras/strwr.py | 12 ++++-- test/DOMINO/test_domino.py | 7 +++- test/MinCostFlow/test_mcf.py | 65 ++++++++++++++---------------- test/OmicsIntegrator2/test_oi2.py | 9 ++--- test/PathLinker/test_pathlinker.py | 36 +++++++---------- test/RWR/test_RWR.py | 26 ++++++------ test/ST_RWR/test_STRWR.py | 34 ++++++++-------- 16 files changed, 120 insertions(+), 114 deletions(-) diff --git a/spras/config/util.py b/spras/config/util.py index c23374a50..32f19076f 100644 --- a/spras/config/util.py +++ b/spras/config/util.py @@ -1,7 +1,8 @@ from enum import Enum -from pydantic import BaseModel, ConfigDict from typing import Any +from pydantic import BaseModel, ConfigDict + # https://stackoverflow.com/a/76883868/7589775 class CaseInsensitiveEnum(str, Enum): diff --git a/spras/domino.py b/spras/domino.py index 187e53836..86f3c0563 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -1,9 +1,9 @@ import json from pathlib import Path +from typing import Optional import pandas as pd from pydantic import BaseModel, ConfigDict -from typing import Optional from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( diff --git a/spras/meo.py b/spras/meo.py index 0451cb4c0..30e81d87e 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -1,8 +1,9 @@ import os from pathlib import Path -from pydantic import BaseModel, ConfigDict from typing import Optional +from pydantic import BaseModel, ConfigDict + from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( add_directionality_constant, @@ -78,7 +79,7 @@ class MEOParams(BaseModel): See "Improving approximations with local search" in the associated paper for more information. """ - + rand_restarts: Optional[int] = None "The number of random restarts to do." 
diff --git a/spras/mincostflow.py b/spras/mincostflow.py index 986c1c8eb..eab80c631 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -1,7 +1,8 @@ from pathlib import Path -from pydantic import BaseModel, ConfigDict from typing import Optional +from pydantic import BaseModel, ConfigDict + from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( convert_undirected_to_directed, @@ -10,7 +11,7 @@ from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['MinCostFlow'] +__all__ = ['MinCostFlow', 'MinCostFlowParams'] class MinCostFlowParams(BaseModel): flow: Optional[float] = None diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 45465ecd6..d8226f735 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -1,7 +1,8 @@ from pathlib import Path -from pydantic import BaseModel, ConfigDict from typing import Optional +from pydantic import BaseModel, ConfigDict + from spras.containers import prepare_volume, run_container_and_log from spras.interactome import reinsert_direction_col_mixed from spras.prm import PRM diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 944bf1bf7..41aec9ee1 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -1,8 +1,8 @@ from pathlib import Path -from pydantic import BaseModel, ConfigDict from typing import Optional import pandas as pd +from pydantic import BaseModel, ConfigDict from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset diff --git a/spras/pathlinker.py b/spras/pathlinker.py index 3c78ffb84..167403cef 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -1,8 +1,9 @@ import warnings from pathlib import Path -from pydantic import BaseModel, ConfigDict from typing import Optional +from pydantic import BaseModel, ConfigDict + from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import ( @@ -12,10 +13,10 @@ from spras.prm import PRM from spras.util import duplicate_edges, raw_pathway_df -__all__ = ['PathLinker'] +__all__ = ['PathLinker', 'PathLinkerParams'] class PathLinkerParams(BaseModel): - k: Optional[int] + k: Optional[int] = None "path length (optional)" model_config = ConfigDict(use_attribute_docstrings=True) @@ -75,7 +76,7 @@ def generate_inputs(data, filename_map): header=["#Interactor1","Interactor2","Weight"]) @staticmethod - def run(inputs, output_file, args, container_framework="docker"): + def run(inputs, output_file, args=PathLinkerParams(), container_framework="docker"): """ Run PathLinker with Docker @param nodetypes: input node types with sources and targets (required) diff --git a/spras/prm.py b/spras/prm.py index 1692f11f6..73c94454a 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -1,7 +1,8 @@ +import os from abc import ABC, abstractmethod +from typing import Any, Generic, TypeVar, cast + from pydantic import BaseModel -from typing import Any, cast, TypeVar, Generic -import os from spras.dataset import Dataset diff --git a/spras/rwr.py b/spras/rwr.py index 12df71e01..ba78589ec 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -1,8 +1,8 @@ from pathlib import Path -from pydantic import BaseModel, ConfigDict from typing import Optional import pandas as pd +from pydantic import BaseModel, ConfigDict from spras.containers import prepare_volume, run_container from spras.dataset import Dataset @@ -10,11 +10,14 @@ from spras.prm import PRM 
from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['RWR'] +__all__ = ['RWR', 'RWRParams'] class RWRParams(BaseModel): - threshold: Optional[int] - alpha: Optional[float] + threshold: int + "The number of nodes to return" + + alpha: Optional[float] = None + "The chance of a restart during the random walk" model_config = ConfigDict(use_attribute_docstrings=True) diff --git a/spras/strwr.py b/spras/strwr.py index c603f9196..37590e7c6 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -1,7 +1,8 @@ from pathlib import Path +from typing import Optional + import pandas as pd from pydantic import BaseModel, ConfigDict -from typing import Optional from spras.containers import prepare_volume, run_container from spras.dataset import Dataset @@ -9,11 +10,14 @@ from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['ST_RWR'] +__all__ = ['ST_RWR', 'ST_RWRParams'] class ST_RWRParams(BaseModel): - threshold: Optional[int] - alpha: Optional[float] + threshold: int + "The number of nodes to return" + + alpha: Optional[float] = None + "The chance of a restart during the random walk" model_config = ConfigDict(use_attribute_docstrings=True) diff --git a/test/DOMINO/test_domino.py b/test/DOMINO/test_domino.py index 62563bdc3..e84c0df8b 100644 --- a/test/DOMINO/test_domino.py +++ b/test/DOMINO/test_domino.py @@ -5,7 +5,12 @@ import pytest import spras.config.config as config -from spras.domino import DOMINO, DominoParams, post_domino_id_transform, pre_domino_id_transform +from spras.domino import ( + DOMINO, + DominoParams, + post_domino_id_transform, + pre_domino_id_transform, +) config.init_from_file("config/config.yaml") diff --git a/test/MinCostFlow/test_mcf.py b/test/MinCostFlow/test_mcf.py index c777a665d..1c9c61a60 100644 --- a/test/MinCostFlow/test_mcf.py +++ b/test/MinCostFlow/test_mcf.py @@ -4,7 +4,7 @@ import pytest import spras.config.config as config -from spras.mincostflow import MinCostFlow +from spras.mincostflow import MinCostFlow, MinCostFlowParams config.init_from_file("config/config.yaml") @@ -21,9 +21,9 @@ def test_mincostflow_required(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE) assert out_path.exists() # TODO: assert for the output .equals expected_output instead of only testing @@ -34,11 +34,11 @@ def test_mincostflow_missing_capacity(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - flow=1) + args=MinCostFlowParams(flow=1)) assert out_path.exists() @pytest.mark.parametrize('graph', ['graph1']) @@ -46,11 +46,11 @@ def test_mincostflow_missing_flow(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + 
'/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - capacity=1) + args=MinCostFlowParams(capacity=1)) assert out_path.exists() @pytest.mark.parametrize('graph', ['graph1']) @@ -59,24 +59,22 @@ def test_mincostflow_too_much_flow(self, graph): out_path.unlink(missing_ok=True) with pytest.raises(RuntimeError): - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - flow=50, - capacity=1) + args=MinCostFlowParams(flow=50, capacity=1)) @pytest.mark.parametrize('graph', ['graph1']) def test_mincostflow_no_flow(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - flow=0, - capacity=1) + args=MinCostFlowParams(flow=0, capacity=1)) assert out_path.exists() @pytest.mark.parametrize('graph', ['graph1']) @@ -84,20 +82,19 @@ def test_mincostflow_all_optional(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include all optional arguments - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - flow=1, - capacity=1) + args=MinCostFlowParams(flow=1, capacity=1)) assert out_path.exists() @pytest.mark.parametrize('graph', ['graph1']) def test_mincostflow_missing(self, graph): # Test the expected error is raised when required arguments are missing with pytest.raises(ValueError): - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt'}, output_file=OUT_FILE) @pytest.mark.parametrize('graph', ['graph1']) @@ -106,12 +103,10 @@ def test_mincostflow_singularity(self, graph): out_path = Path(OUT_FILE) out_path.unlink(missing_ok=True) # Include all optional arguments - MinCostFlow.run(sources=TEST_DIR + 'input/' + graph + '/sources.txt', - targets=TEST_DIR + 'input/' + graph + '/targets.txt', - edges=TEST_DIR + 'input/' + graph + '/edges.txt', + MinCostFlow.run({"sources": TEST_DIR + 'input/' + graph + '/sources.txt', + "targets": TEST_DIR + 'input/' + graph + '/targets.txt', + "edges": TEST_DIR + 'input/' + graph + '/edges.txt'}, output_file=OUT_FILE, - flow=1, - capacity=1, + args=MinCostFlowParams(flow=1, capacity=1), 
container_framework="singularity") assert out_path.exists() - diff --git a/test/OmicsIntegrator2/test_oi2.py b/test/OmicsIntegrator2/test_oi2.py index 0239d5e5f..172197efd 100644 --- a/test/OmicsIntegrator2/test_oi2.py +++ b/test/OmicsIntegrator2/test_oi2.py @@ -8,11 +8,10 @@ config.init_from_file("config/config.yaml") -TEST_DIR = 'test/OmicsIntegrator2/' -EDGE_FILE = TEST_DIR+'input/oi2-edges.txt' -PRIZE_FILE = TEST_DIR+'input/oi2-prizes.txt' -OUT_FILE = Path(TEST_DIR, 'output', 'test.tsv') - +TEST_DIR = Path('test', 'OmicsIntegrator2') +EDGE_FILE = TEST_DIR / 'input' / 'oi2-edges.txt' +PRIZE_FILE = TEST_DIR / 'input' / 'oi2-prizes.txt' +OUT_FILE = TEST_DIR / 'output' / 'test.tsv' class TestOmicsIntegrator2: """ diff --git a/test/PathLinker/test_pathlinker.py b/test/PathLinker/test_pathlinker.py index ed9f10670..67e4b598f 100644 --- a/test/PathLinker/test_pathlinker.py +++ b/test/PathLinker/test_pathlinker.py @@ -4,7 +4,7 @@ import pytest import spras.config.config as config -from spras.pathlinker import PathLinker +from spras.pathlinker import PathLinker, PathLinkerParams config.init_from_file("config/config.yaml") @@ -21,33 +21,28 @@ def test_pathlinker_required(self): out_path = Path(OUT_FILE_DEFAULT) out_path.unlink(missing_ok=True) # Only include required arguments - PathLinker.run( - nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt', - network=TEST_DIR+'input/sample-in-net.txt', - output_file=OUT_FILE_DEFAULT - ) + PathLinker.run({"nodetypes": TEST_DIR+'input/sample-in-nodetypes.txt', + "network": TEST_DIR+'input/sample-in-net.txt'}, + output_file=OUT_FILE_DEFAULT) assert out_path.exists() def test_pathlinker_optional(self): out_path = Path(OUT_FILE_100) out_path.unlink(missing_ok=True) # Include optional argument - PathLinker.run( - nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt', - network=TEST_DIR+'input/sample-in-net.txt', - output_file=OUT_FILE_100, - k=100 - ) + PathLinker.run({"nodetypes": TEST_DIR+'input/sample-in-nodetypes.txt', + "network": TEST_DIR+'input/sample-in-net.txt'}, + output_file=OUT_FILE_100, + args=PathLinkerParams(k=100)) assert out_path.exists() def test_pathlinker_missing(self): # Test the expected error is raised when required arguments are missing with pytest.raises(ValueError): # No nodetypes - PathLinker.run( - network=TEST_DIR + 'input/sample-in-net.txt', - output_file=OUT_FILE_100, - k=100) + PathLinker.run({"network": TEST_DIR + 'input/sample-in-net.txt'}, + output_file=OUT_FILE_100, + args=PathLinkerParams(k=100)) # Only run Singularity test if the binary is available on the system # spython is only available on Unix, but do not explicitly skip non-Unix platforms @@ -56,9 +51,8 @@ def test_pathlinker_singularity(self): out_path = Path(OUT_FILE_DEFAULT) out_path.unlink(missing_ok=True) # Only include required arguments and run with Singularity - PathLinker.run( - nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt', - network=TEST_DIR+'input/sample-in-net.txt', - output_file=OUT_FILE_DEFAULT, - container_framework="singularity") + PathLinker.run({"nodetypes": TEST_DIR+'input/sample-in-nodetypes.txt', + "network": TEST_DIR+'input/sample-in-net.txt'}, + output_file=OUT_FILE_DEFAULT, + container_framework="singularity") assert out_path.exists() diff --git a/test/RWR/test_RWR.py b/test/RWR/test_RWR.py index b0316ded0..70eb06845 100644 --- a/test/RWR/test_RWR.py +++ b/test/RWR/test_RWR.py @@ -5,7 +5,7 @@ import pytest import spras.config.config as config -from spras.rwr import RWR +from spras.rwr import RWR, RWRParams 
config.init_from_file("config/config.yaml") @@ -19,9 +19,9 @@ class TestRWR: """ def test_rwr(self): OUT_FILE.unlink(missing_ok=True) - RWR.run(network=Path(TEST_DIR, 'input', 'rwr-network.txt'), - nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'), - alpha=0.85, + RWR.run({"network": Path(TEST_DIR, 'input', 'rwr-network.txt'), + "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')}, + args=RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE) assert OUT_FILE.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected_output', 'rwr-output.txt') @@ -32,9 +32,9 @@ def test_rwr(self): """ def test_missing_file(self): with pytest.raises(OSError): - RWR.run(network=Path(TEST_DIR, 'input', 'missing.txt'), - nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'), - alpha=0.85, + RWR.run({"network": Path(TEST_DIR, 'input', 'missing.txt'), + "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')}, + args=RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE) """ @@ -42,9 +42,9 @@ def test_missing_file(self): """ def test_format_error(self): with pytest.raises(ValueError): - RWR.run(network=Path(TEST_DIR, 'input', 'rwr-bad-network.txt'), - nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'), - alpha=0.85, + RWR.run({"network": Path(TEST_DIR, 'input', 'rwr-bad-network.txt'), + "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')}, + args=RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE) # Only run Singularity test if the binary is available on the system @@ -53,9 +53,9 @@ def test_format_error(self): def test_rwr_singularity(self): OUT_FILE.unlink(missing_ok=True) # Only include required arguments and run with Singularity - RWR.run(network=Path(TEST_DIR, 'input', 'rwr-network.txt'), - nodes=Path(TEST_DIR, 'input','rwr-nodes.txt'), - alpha=0.85, + RWR.run({"network": Path(TEST_DIR, 'input', 'rwr-network.txt'), + "nodes": Path(TEST_DIR, 'input','rwr-nodes.txt')}, + args=RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE, container_framework="singularity") assert OUT_FILE.exists() diff --git a/test/ST_RWR/test_STRWR.py b/test/ST_RWR/test_STRWR.py index 898b24055..ea0c2bda0 100644 --- a/test/ST_RWR/test_STRWR.py +++ b/test/ST_RWR/test_STRWR.py @@ -5,7 +5,7 @@ import pytest import spras.config.config as config -from spras.strwr import ST_RWR +from spras.strwr import ST_RWR, ST_RWRParams config.init_from_file("config/config.yaml") @@ -20,10 +20,10 @@ class TestSTRWR: """ def test_strwr(self): OUT_FILE.unlink(missing_ok=True) - ST_RWR.run(network=Path(TEST_DIR, 'input', 'strwr-network.txt'), - sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'), - targets=Path(TEST_DIR, 'input','strwr-targets.txt'), - alpha=0.85, + ST_RWR.run({"network": Path(TEST_DIR, 'input', 'strwr-network.txt'), + "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'), + "targets": Path(TEST_DIR, 'input','strwr-targets.txt')}, + args=ST_RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE) assert OUT_FILE.exists(), 'Output file was not written' expected_file = Path(TEST_DIR, 'expected_output', 'strwr-output.txt') @@ -34,10 +34,10 @@ def test_strwr(self): """ def test_missing_file(self): with pytest.raises(OSError): - ST_RWR.run(network=Path(TEST_DIR, 'input', 'missing.txt'), - sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'), - targets=Path(TEST_DIR, 'input','strwr-targets.txt'), - alpha=0.85, + ST_RWR.run({"network": Path(TEST_DIR, 'input', 'missing.txt'), + "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'), + "targets": Path(TEST_DIR, 'input','strwr-targets.txt')}, + args=ST_RWRParams(alpha=0.85, 
threshold=200), output_file=OUT_FILE) """ @@ -45,10 +45,10 @@ def test_missing_file(self): """ def test_format_error(self): with pytest.raises(ValueError): - ST_RWR.run(network=Path(TEST_DIR, 'input', 'strwr-bad-network.txt'), - sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'), - targets=Path(TEST_DIR, 'input','strwr-targets.txt'), - alpha=0.85, + ST_RWR.run({"network": Path(TEST_DIR, 'input', 'strwr-bad-network.txt'), + "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'), + "targets": Path(TEST_DIR, 'input','strwr-targets.txt')}, + args=ST_RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE) # Only run Singularity test if the binary is available on the system @@ -57,10 +57,10 @@ def test_format_error(self): def test_strwr_singularity(self): OUT_FILE.unlink(missing_ok=True) # Only include required arguments and run with Singularity - ST_RWR.run(network=Path(TEST_DIR, 'input', 'strwr-network.txt'), - sources=Path(TEST_DIR, 'input', 'strwr-sources.txt'), - targets=Path(TEST_DIR, 'input','strwr-targets.txt'), - alpha=0.85, + ST_RWR.run({"network": Path(TEST_DIR, 'input', 'strwr-network.txt'), + "sources": Path(TEST_DIR, 'input', 'strwr-sources.txt'), + "targets": Path(TEST_DIR, 'input','strwr-targets.txt')}, + args=ST_RWRParams(alpha=0.85, threshold=200), output_file=OUT_FILE, container_framework="singularity") assert OUT_FILE.exists() From da6771166f36fb9d1a1f9d8b651296de02546fd1 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 20:47:08 +0000 Subject: [PATCH 07/68] fix: specify default args out of run --- spras/allpairs.py | 2 +- spras/btb.py | 2 +- spras/domino.py | 7 +++++-- spras/meo.py | 5 ++++- spras/mincostflow.py | 5 ++++- spras/omicsintegrator1.py | 8 ++++---- spras/omicsintegrator2.py | 5 ++++- spras/pathlinker.py | 17 +++++++---------- 8 files changed, 30 insertions(+), 21 deletions(-) diff --git a/spras/allpairs.py b/spras/allpairs.py index 15a3b17f7..670d3f721 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -72,7 +72,7 @@ def generate_inputs(data: Dataset, filename_map): header=["#Interactor1", "Interactor2", "Weight"]) @staticmethod - def run(inputs, output_file, args=Empty(), container_framework="docker"): + def run(inputs, output_file, args=None, container_framework="docker"): """ Run All Pairs Shortest Paths with Docker @param nodetypes: input node types with sources and targets (required) diff --git a/spras/btb.py b/spras/btb.py index 6ad3afb69..81474bdb2 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -65,7 +65,7 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(inputs, output_file, args=Empty(), container_framework="docker"): + def run(inputs, output_file, args=None, container_framework="docker"): # Tests for pytest (docker container also runs this) # Testing out here avoids the trouble that container errors provide diff --git a/spras/domino.py b/spras/domino.py index 86f3c0563..16a70a788 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -76,7 +76,10 @@ def generate_inputs(data, filename_map): header=['ID_interactor_A', 'ppi', 'ID_interactor_B']) @staticmethod - def run(inputs, output_file, args=DominoParams(), container_framework="docker"): + def run(inputs, output_file, args=None, container_framework="docker"): + if not args: + args = DominoParams() + # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. 
if not inputs["network"] or not inputs["active_genes"]: raise ValueError('Required DOMINO arguments are missing') @@ -152,7 +155,7 @@ def run(inputs, output_file, args=DominoParams(), container_framework="docker"): # Clean up DOMINO intermediate and pickle files slices_file.unlink(missing_ok=True) Path(out_dir, 'network.slices.pkl').unlink(missing_ok=True) - Path(network + '.pkl').unlink(missing_ok=True) + Path(f"{inputs['network']}.pkl").unlink(missing_ok=True) @staticmethod def parse_output(raw_pathway_file, standardized_pathway_file, params): diff --git a/spras/meo.py b/spras/meo.py index 30e81d87e..02edf07af 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -145,7 +145,7 @@ def generate_inputs(data, filename_map): # TODO add parameter validation # TODO document required arguments @staticmethod - def run(inputs, args=MEOParams(), output_file=None, container_framework="docker"): + def run(inputs, output_file=None, args=None, container_framework="docker"): """ Run Maximum Edge Orientation in the Docker image with the provided parameters. The properties file is generated from the provided arguments. @@ -154,6 +154,9 @@ def run(inputs, args=MEOParams(), output_file=None, container_framework="docker" Only the edge output file is retained. All other output files are deleted. """ + if not args: + args = MEOParams() + if inputs["edges"] is None or inputs["sources"] is None or inputs["targets"] is None: raise ValueError('Required Maximum Edge Orientation arguments are missing') diff --git a/spras/mincostflow.py b/spras/mincostflow.py index eab80c631..b2267f800 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -72,7 +72,10 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(inputs, output_file, args=MinCostFlowParams(), container_framework="docker"): + def run(inputs, output_file, args=None, container_framework="docker"): + if not args: + args = MinCostFlowParams() + # ensures that these parameters are required if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: raise ValueError('Required MinCostFlow arguments are missing') diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index d8226f735..9152e80a6 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -40,8 +40,8 @@ def write_conf(filename=Path('config.txt'), w=None, b=None, d=None, mu=None, noi class OmicsIntegrator1Params(BaseModel): dummy_mode: Optional[str] = None - mu_squared: Optional[bool] = None - exclude_terms: Optional[bool] = None + mu_squared: bool = False + exclude_terms: bool = False noisy_edges: Optional[int] = None "How many times you would like to add noise to the given edge values and re-run the algorithm." 
@@ -202,9 +202,9 @@ def run(inputs, output_file, args, container_framework="docker"): command.extend(['--dummyMode', args.dummy_mode]) # Add optional arguments - if args.mu_squared is not None and args.mu_squared: + if args.mu_squared: command.extend(['--musquared']) - if args.exclude_terms is not None and args.exclude_terms: + if args.exclude_terms: command.extend(['--excludeTerms']) if args.noisy_edges is not None: command.extend(['--noisyEdges', str(args.noisy_edges)]) diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 41aec9ee1..fb420de8e 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -103,7 +103,7 @@ def generate_inputs(data: Dataset, filename_map): # TODO add reasonable default values # TODO document required arguments @staticmethod - def run(inputs, output_file, args=OmicsIntegrator2Params(), container_framework="docker"): + def run(inputs, output_file, args=None, container_framework="docker"): """ Run Omics Integrator 2 in the Docker image with the provided parameters. Only the .tsv output file is retained and then renamed. @@ -111,6 +111,9 @@ def run(inputs, output_file, args=OmicsIntegrator2Params(), container_framework= @param output_file: the name of the output file, which will overwrite any existing file with this name @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) """ + if not args: + args = OmicsIntegrator2Params() + if inputs["edges"] is None or inputs["prizes"] is None: raise ValueError('Required Omics Integrator 2 arguments are missing') diff --git a/spras/pathlinker.py b/spras/pathlinker.py index 167403cef..d5ac385f4 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -16,8 +16,8 @@ __all__ = ['PathLinker', 'PathLinkerParams'] class PathLinkerParams(BaseModel): - k: Optional[int] = None - "path length (optional)" + k: int = 100 + "path length" model_config = ConfigDict(use_attribute_docstrings=True) @@ -76,7 +76,7 @@ def generate_inputs(data, filename_map): header=["#Interactor1","Interactor2","Weight"]) @staticmethod - def run(inputs, output_file, args=PathLinkerParams(), container_framework="docker"): + def run(inputs, output_file, args=None, container_framework="docker"): """ Run PathLinker with Docker @param nodetypes: input node types with sources and targets (required) @@ -85,10 +85,9 @@ def run(inputs, output_file, args=PathLinkerParams(), container_framework="docke @param k: @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) """ - # Add additional parameter validation - # Do not require k - # Use the PathLinker default - # Could consider setting the default here instead + if not args: + args = PathLinkerParams() + if not inputs["nodetypes"] or not inputs["network"]: raise ValueError('Required PathLinker arguments are missing') @@ -118,9 +117,7 @@ def run(inputs, output_file, args=PathLinkerParams(), container_framework="docke node_file, '--output', mapped_out_prefix] - # Add optional argument - if args.k is not None: - command.extend(['-k', str(args.k)]) + command.extend(['-k', str(args.k)]) container_suffix = "pathlinker:v2" run_container_and_log('PathLinker', From 45cfe87a46a6850a3f1ce6d890b25c3b47eff781 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Mon, 14 Jul 2025 21:03:20 +0000 Subject: [PATCH 08/68] fix: more defaults --- spras/domino.py | 2 +- spras/mincostflow.py | 2 +- spras/omicsintegrator1.py | 16 ++++++---------- spras/omicsintegrator2.py | 10 +++++----- spras/pathlinker.py | 8 -------- 5 files changed, 13 insertions(+), 25 deletions(-) diff --git a/spras/domino.py b/spras/domino.py index 16a70a788..110b11ab3 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -79,7 +79,7 @@ def generate_inputs(data, filename_map): def run(inputs, output_file, args=None, container_framework="docker"): if not args: args = DominoParams() - + # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. if not inputs["network"] or not inputs["active_genes"]: raise ValueError('Required DOMINO arguments are missing') diff --git a/spras/mincostflow.py b/spras/mincostflow.py index b2267f800..2673d91e2 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -75,7 +75,7 @@ def generate_inputs(data, filename_map): def run(inputs, output_file, args=None, container_framework="docker"): if not args: args = MinCostFlowParams() - + # ensures that these parameters are required if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: raise ValueError('Required MinCostFlow arguments are missing') diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 9152e80a6..74f55bff7 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -43,13 +43,13 @@ class OmicsIntegrator1Params(BaseModel): mu_squared: bool = False exclude_terms: bool = False - noisy_edges: Optional[int] = None + noisy_edges: int = 0 "How many times you would like to add noise to the given edge values and re-run the algorithm." - shuffled_prizes: Optional[int] = None + shuffled_prizes: int = 0 "shuffled_prizes: How many times the algorithm should shuffle the prizes and re-run" - random_terminals: Optional[int] = None + random_terminals: int = 0 "How many times to apply the given prizes to random nodes in the interactome" seed: Optional[int] = None @@ -140,7 +140,6 @@ def generate_inputs(data, filename_map): # TODO add support for knockout argument # TODO add reasonable default values - # TODO document required arguments @staticmethod def run(inputs, output_file, args, container_framework="docker"): if inputs["edges"] is None or inputs["prizes"] is None or output_file is None: @@ -206,12 +205,9 @@ def run(inputs, output_file, args, container_framework="docker"): command.extend(['--musquared']) if args.exclude_terms: command.extend(['--excludeTerms']) - if args.noisy_edges is not None: - command.extend(['--noisyEdges', str(args.noisy_edges)]) - if args.shuffled_prizes is not None: - command.extend(['--shuffledPrizes', str(args.shuffled_prizes)]) - if args.random_terminals is not None: - command.extend(['--randomTerminals', str(args.random_terminals)]) + command.extend(['--noisyEdges', str(args.noisy_edges)]) + command.extend(['--shuffledPrizes', str(args.shuffled_prizes)]) + command.extend(['--randomTerminals', str(args.random_terminals)]) if args.seed is not None: command.extend(['--seed', str(args.seed)]) diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index fb420de8e..f0a2d9c52 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -1,8 +1,9 @@ +import time from pathlib import Path from typing import Optional import pandas as pd -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, Field from spras.containers import 
prepare_volume, run_container_and_log from spras.dataset import Dataset @@ -39,8 +40,8 @@ class OmicsIntegrator2Params(BaseModel): "all" = connect to all nodes in the interactome. """ - seed: Optional[int] = None - "The random seed to use for this run." + seed: int = Field(default_factory=lambda _: int(time.time() * 1000)) + "The random seed to use for this run. Defaults to the current UNIX timestamp." model_config = ConfigDict(use_attribute_docstrings=True) @@ -153,8 +154,7 @@ def run(inputs, output_file, args=None, container_framework="docker"): if args.dummy_mode is not None: # This argument does not follow the other naming conventions command.extend(['--dummyMode', str(args.dummy_mode)]) - if args.seed is not None: - command.extend(['--seed', str(args.seed)]) + command.extend(['--seed', str(args.seed)]) container_suffix = "omics-integrator-2:v2" run_container_and_log('Omics Integrator 2', diff --git a/spras/pathlinker.py b/spras/pathlinker.py index d5ac385f4..9b6fe964c 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -77,14 +77,6 @@ def generate_inputs(data, filename_map): @staticmethod def run(inputs, output_file, args=None, container_framework="docker"): - """ - Run PathLinker with Docker - @param nodetypes: input node types with sources and targets (required) - @param network: input network file (required) - @param output_file: path to the output pathway file (required) - @param k: - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - """ if not args: args = PathLinkerParams() From e0808570331316b5dbfd5af5bc4d2f4702635bb8 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 22:13:13 +0000 Subject: [PATCH 09/68] feat: begin algorithm parsing --- config/config.yaml | 4 +-- spras/config/algorithms.py | 63 ++++++++++++++++++++++++++++++++++++++ spras/config/schema.py | 21 ++----------- spras/runner.py | 43 ++++++++++++++------------ 4 files changed, 91 insertions(+), 40 deletions(-) create mode 100644 spras/config/algorithms.py diff --git a/config/config.yaml b/config/config.yaml index 8092b9eb9..5d23946d4 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -81,8 +81,8 @@ algorithms: rand_restarts: 10 - name: "mincostflow" - params: - include: true + include: true + runs: run1: flow: 1 # The flow must be an int capacity: 1 diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py new file mode 100644 index 000000000..9e78788f4 --- /dev/null +++ b/spras/config/algorithms.py @@ -0,0 +1,63 @@ +""" +Dynamic construction of algorithm parameters with runtime type information for +parameter combinations. This has been isolated from schema.py as it is not declarative, +and rather mainly contains validators and lower-level pydantic code. +""" +from typing import Any, cast, Union + +from spras.runner import algorithms +from pydantic import BaseModel, create_model + +__all__ = ['AlgorithmUnion'] + +def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseModel]: + """ + Dynamically constructs a parameter-combination model based on the original args model. + This is the most 'hacky' part of this code, but, thanks to pydantic, we almost* + avoid reflection and preserve rich type information. + """ + # First, we need to take our 'model' and coerce it to permit parameter combinations. + # This assumes that all of the keys are flattened, so we only get a structure like so: + # class AlgorithmParams(BaseModel): + # key1: int + # key2: list[str] + # ...
+ and we want to transform this to: + # class AlgorithmParamsCombination(BaseModel): + # key1: list[int] + # key2: list[list[str]] + # This function does not worry about getting the Cartesian product of this. + + # Map our fields to a list (assuming we have no nested keys) + mapped_list_field: dict[str, type[list[Any]]] = {name: list[field.annotation] for name, field in model.model_fields.items()} + + # Runtime assertion check: mapped_list_field does not contain any `__`-prefixed fields + for key in mapped_list_field.keys(): + assert not key.startswith("__"), f"A private key has been passed from {name}'s argument schema. " + \ + "This should have been caught by the Snakemake CI step." + + # Pass this as kwargs to create_model, which usually takes in parameters field_name=type. + # This is the asterisk (*) from the docstring: we do need to cast create_model, since otherwise + # the type-checker complains that we may have had a key that starts with __ in mapped_list_fields. + # The above assertion prevents this. + run_model = (cast(Any, create_model))( + f'{name}RunModel', + **mapped_list_field + ) + + # Here is an example of what this would look like inside config.yaml + # name: pathlinker + # include: true + # runs: + # run1: + # (from run_model) + # ... + return create_model( + f'{name}Model', + name=name, + include=bool, + runs=dict[str, run_model] + ) + +algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model) for name, (_, model) in algorithms.items()] +AlgorithmUnion = Union[tuple(algorithm_models)] diff --git a/spras/config/schema.py b/spras/config/schema.py index 623c9dd9b..7a42673d6 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -11,13 +11,13 @@ """ import re -from typing import Annotated, Optional +from typing import Annotated from pydantic import AfterValidator, BaseModel, ConfigDict, Field +from spras.config.algorithms import AlgorithmUnion from spras.config.util import CaseInsensitiveEnum - class SummaryAnalysis(BaseModel): include: bool @@ -87,21 +87,6 @@ class ContainerRegistry(BaseModel): model_config = ConfigDict(extra='forbid') -class AlgorithmParams(BaseModel): - include: bool - directed: Optional[bool] = None - - # TODO: use array of runs instead. We currently rely on the - # extra parameters here to extract the algorithm parameter information, - # which is why this deviates from the usual ConfigDict(extra='forbid'). - model_config = ConfigDict(extra='allow') - -class Algorithm(BaseModel): - name: str - params: AlgorithmParams - - model_config = ConfigDict(extra='forbid') - class Dataset(BaseModel): label: Annotated[str, AfterValidator(label_validator("Dataset"))] node_files: list[str] @@ -139,7 +124,7 @@ class RawConfig(BaseModel): description="The length of the hash used to identify a parameter combination", default=DEFAULT_HASH_LENGTH) - algorithms: list[Algorithm] + algorithms: list[AlgorithmUnion] # type: ignore - pydantic allows this.
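To make the dynamic construction concrete, here is a hand-written sketch of roughly what `construct_algorithm_model('pathlinker', PathLinkerParams)` builds, assuming pydantic v2's `create_model`; the tuple form `(type, ...)` marks a required field, and the `Literal[name]` discriminator is the form this series settles on in a later patch:

    from typing import Literal
    from pydantic import BaseModel, create_model

    class PathLinkerParams(BaseModel):
        k: int = 100

    # Every params field becomes a list of candidate values for the sweep.
    PathlinkerRunModel = create_model('pathlinkerRunModel', k=(list[int], ...))
    PathlinkerModel = create_model(
        'pathlinkerModel',
        name=(Literal['pathlinker'], ...),
        include=(bool, ...),
        runs=(dict[str, PathlinkerRunModel], ...),
    )

    parsed = PathlinkerModel.model_validate({
        'name': 'pathlinker',
        'include': True,
        'runs': {'run1': {'k': [10, 100]}},
    })

Validating the top-level `algorithms:` list against `Union[tuple(algorithm_models)]` then lets pydantic match each entry to the model whose `name` literal it carries.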
datasets: list[Dataset] gold_standards: list[GoldStandard] = [] analysis: Analysis = Analysis() reconstruction_settings: ReconstructionSettings model_config = ConfigDict(extra='forbid') diff --git a/spras/runner.py b/spras/runner.py index a023a9606..843b3cf46 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -1,35 +1,38 @@ from typing import Any +from pydantic import BaseModel + # supported algorithm imports from spras.allpairs import AllPairs from spras.btb import BowTieBuilder from spras.dataset import Dataset -from spras.domino import DOMINO -from spras.meo import MEO -from spras.mincostflow import MinCostFlow -from spras.omicsintegrator1 import OmicsIntegrator1 -from spras.omicsintegrator2 import OmicsIntegrator2 -from spras.pathlinker import PathLinker +from spras.config.util import Empty +from spras.domino import DOMINO, DominoParams +from spras.meo import MEO, MEOParams +from spras.mincostflow import MinCostFlow, MinCostFlowParams +from spras.omicsintegrator1 import OmicsIntegrator1, OmicsIntegrator1Params +from spras.omicsintegrator2 import OmicsIntegrator2, OmicsIntegrator2Params +from spras.pathlinker import PathLinker, PathLinkerParams from spras.prm import PRM -from spras.rwr import RWR -from spras.strwr import ST_RWR +from spras.rwr import RWR, RWRParams +from spras.strwr import ST_RWR, ST_RWRParams -algorithms: dict[str, type[PRM]] = { - "allpairs": AllPairs, - "bowtiebuilder": BowTieBuilder, - "domino": DOMINO, - "meo": MEO, - "mincostflow": MinCostFlow, - "omicsintegrator1": OmicsIntegrator1, - "omicsintegrator2": OmicsIntegrator2, - "pathlinker": PathLinker, - "rwr": RWR, - "strwr": ST_RWR, +algorithms: dict[str, tuple[type[PRM], type[BaseModel]]] = { + "allpairs": (AllPairs, Empty), + "bowtiebuilder": (BowTieBuilder, Empty), + "domino": (DOMINO, DominoParams), + "meo": (MEO, MEOParams), + "mincostflow": (MinCostFlow, MinCostFlowParams), + "omicsintegrator1": (OmicsIntegrator1, OmicsIntegrator1Params), + "omicsintegrator2": (OmicsIntegrator2, OmicsIntegrator2Params), + "pathlinker": (PathLinker, PathLinkerParams), + "rwr": (RWR, RWRParams), + "strwr": (ST_RWR, ST_RWRParams), } def get_algorithm(algorithm: str) -> type[PRM]: try: - return algorithms[algorithm.lower()] + return algorithms[algorithm.lower()][0] except KeyError as exc: raise NotImplementedError(f'{algorithm} is not currently supported.') from exc From 53f55e27a7bc1040c77c8941746156f497c214e4 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 22:59:58 +0000 Subject: [PATCH 10/68] fix: clean up type errors, begin nondeterminism --- spras/config/algorithms.py | 13 ++++++------- spras/config/schema.py | 4 ++++ spras/config/util.py | 30 ++++++++++++++++++++++++++++-- spras/domino.py | 5 +++-- spras/omicsintegrator1.py | 7 ++++--- 5 files changed, 45 insertions(+), 14 deletions(-) diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index 9e78788f4..bc7b896fc 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -3,7 +3,7 @@ parameter combinations. This has been isolated from schema.py as it is not declarative, and rather mainly contains validators and lower-level pydantic code. """ -from typing import Any, cast, Union +from typing import Any, cast, Union, Literal from spras.runner import algorithms from pydantic import BaseModel, create_model __all__ = ['AlgorithmUnion'] def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseModel]: """ Dynamically constructs a parameter-combination model based on the original args model.
- This is the most 'hacky' part of this code, but, thanks to pydantic, we almost* - avoid reflection and preserve rich type information. + This is the most 'hacky' part of this code, but, thanks to pydantic, we avoid reflection + and preserve rich type information at runtime. """ # First, we need to take our 'model' and coerce it to permit parameter combinations. # This assumes that all of the keys are flattened, so we only get a structure like so: @@ -37,9 +37,8 @@ def construct_algorithm_model(name: str, model: type[BaseMod "This should have been caught by the Snakemake CI step." # Pass this as kwargs to create_model, which usually takes in parameters field_name=type. - # This is the asterisk (*) from the docstring: we do need to cast create_model, since otherwise - # the type-checker complains that we may have had a key that starts with __ in mapped_list_fields. - # The above assertion prevents this. + # We do need to cast create_model, since otherwise the type-checker complains that we may + # have had a key that starts with __ in mapped_list_fields. The above assertion prevents this. run_model = (cast(Any, create_model))( f'{name}RunModel', **mapped_list_field ) @@ -53,7 +53,7 @@ def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseMod # ... return create_model( f'{name}Model', - name=name, + name=Literal[name], include=bool, runs=dict[str, run_model] ) diff --git a/spras/config/schema.py b/spras/config/schema.py index 7a42673d6..76404b387 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -124,6 +124,7 @@ class RawConfig(BaseModel): description="The length of the hash used to identify a parameter combination", default=DEFAULT_HASH_LENGTH) + # See algorithms.py for more information about AlgorithmUnion algorithms: list[AlgorithmUnion] # type: ignore - pydantic allows this. datasets: list[Dataset] gold_standards: list[GoldStandard] = [] @@ -132,3 +133,6 @@ class RawConfig(BaseModel): reconstruction_settings: ReconstructionSettings model_config = ConfigDict(extra='forbid') + +# AlgorithmUnion is dynamically constructed. +RawConfig.model_rebuild() diff --git a/spras/config/util.py b/spras/config/util.py index 32f19076f..0ed99a26e 100644 --- a/spras/config/util.py +++ b/spras/config/util.py @@ -1,7 +1,14 @@ +""" +General config utilities. This is the only config file +that should be imported by algorithms, and algorithms should +only import this config file. +""" + from enum import Enum +import time from typing import Any -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, Field # https://stackoverflow.com/a/76883868/7589775 @@ -23,6 +30,25 @@ def _missing_(cls, value: Any): class Empty(BaseModel): """ - The empty base model. Used for specifying that an algorithm takes no parameters. + The empty base model. Used for specifying that an algorithm takes no parameters, + yet is deterministic. """ model_config = ConfigDict(extra="forbid") + +class NondeterministicModel(BaseModel): + """ + A nondeterministic model. Any seedless nondeterministic algorithm should extend this. + Internally, this inserts a _time parameter that can be serialized but not + deserialized, and will affect the hash. + """ + + # We don't make this a PrivateAttr for reasons explained in the doc comment. + time: float = Field(default_factory=time.time, alias="_time") + """ + The internal _time parameter. This is a parameter only given to nondeterministic + algorithms that provide no randomness seed.
While this should be unset, + we allow specifying `_time` for users that want to re-use outputs of runs, + though this explicitly breaks the 'immutability' promise of runs. + """ + + model_config = ConfigDict(use_attribute_docstrings=True) diff --git a/spras/domino.py b/spras/domino.py index 30ccc8a84..a9ce7a43b 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -3,9 +3,10 @@ from typing import Optional import pandas as pd -from pydantic import BaseModel, ConfigDict +from pydantic import ConfigDict from spras.containers import prepare_volume, run_container_and_log +from spras.config.util import NondeterministicModel from spras.interactome import ( add_constant, reinsert_direction_col_undirected, @@ -18,7 +19,7 @@ ID_PREFIX = 'ENSG0' ID_PREFIX_LEN = len(ID_PREFIX) -class DominoParams(BaseModel): +class DominoParams(NondeterministicModel): module_threshold: Optional[float] = None "the p-value threshold for considering a slice as relevant (optional)" diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 74f55bff7..ddb934bb5 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -1,7 +1,8 @@ from pathlib import Path +import time from typing import Optional -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, Field from spras.containers import prepare_volume, run_container_and_log from spras.interactome import reinsert_direction_col_mixed @@ -52,8 +53,8 @@ class OmicsIntegrator1Params(BaseModel): random_terminals: int = 0 "How many times to apply the given prizes to random nodes in the interactome" - seed: Optional[int] = None - "the randomness seed to use" + seed: int = Field(default_factory=lambda _: int(time.time() * 1000)) + "The random seed to use for this run. Defaults to the current UNIX timestamp." w: int "the number of trees" From a4e265d2a2930b3b7b2e44a2907d64be9a08f09c Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 23:15:07 +0000 Subject: [PATCH 11/68] chore: begin little utility --- config/config.yaml | 4 ++-- util/update_schema.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) create mode 100644 util/update_schema.py diff --git a/config/config.yaml b/config/config.yaml index 5d23946d4..7bb58dcdf 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -73,8 +73,8 @@ algorithms: g: 3 - name: "meo" - params: - include: true + include: true + runs: run1: max_path_length: 3 local_search: "Yes" diff --git a/util/update_schema.py b/util/update_schema.py new file mode 100644 index 000000000..c6a7bedca --- /dev/null +++ b/util/update_schema.py @@ -0,0 +1,13 @@ +""" +Updates config/schema.json. +This should be done whenever a new algorithm is introduced, +or the config is otherwise directly changed. +""" + +import json +from pathlib import Path + +from spras.config.schema import RawConfig + +config_schema = RawConfig.model_json_schema() +Path('config/schema.json').write_text(json.dumps(config_schema, indent=2)) From 145b2ec9c16b736a7e2939cd257f69f0abc456cf Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Mon, 14 Jul 2025 16:46:12 -0700 Subject: [PATCH 12/68] chore: mv container schema changes over --- config/config.yaml | 50 +++++++++++++------------ spras/config/config.py | 23 ++---------- spras/config/container_schema.py | 64 ++++++++++++++++++++++++++++++++ spras/config/schema.py | 6 +-- spras/containers.py | 20 +++++----- 5 files changed, 105 insertions(+), 58 deletions(-) create mode 100644 spras/config/container_schema.py diff --git a/config/config.yaml b/config/config.yaml index 7bb58dcdf..a834131e6 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -3,30 +3,32 @@ # The length of the hash used to identify a parameter combination hash_length: 7 -# Specify the container framework used by each PRM wrapper. Valid options include: -# - docker (default if not specified) -# - singularity -- Also known as apptainer, useful in HPC/HTC environments where docker isn't allowed -# - dsub -- experimental with limited support, used for running on Google Cloud with the All of Us cloud environment. -# - There is no support for other environments at the moment. -container_framework: docker - -# Only used if container_framework is set to singularity, this will unpack the singularity containers -# to the local filesystem. This is useful when PRM containers need to run inside another container, -# such as would be the case in an HTCondor/OSPool environment. -# NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way -# that persists after the workflow is complete. To clean up the unpacked containers, the user must -# manually delete them. For convenience, these unpacked files will exist in the current working directory -# under `unpacked`. -unpack_singularity: false - -# Allow the user to configure which container registry containers should be pulled from -# Note that this assumes container names are consistent across registries, and that the -# registry being passed doesn't require authentication for pull actions -container_registry: - base_url: docker.io - # The owner or project of the registry - # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs - owner: reedcompbio +# Collection of container options +containers: + # Specify the container framework used by each PRM wrapper. Valid options include: + # - docker (default if not specified) + # - singularity -- Also known as apptainer, useful in HPC/HTC environments where docker isn't allowed + # - dsub -- experimental with limited support, used for running on Google Cloud with the All of Us cloud environment. + # - There is no support for other environments at the moment. + framework: docker + + # Only used if container_framework is set to singularity, this will unpack the singularity containers + # to the local filesystem. This is useful when PRM containers need to run inside another container, + # such as would be the case in an HTCondor/OSPool environment. + # NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way + # that persists after the workflow is complete. To clean up the unpacked containers, the user must + # manually delete them. For convenience, these unpacked files will exist in the current working directory + # under `unpacked`. 
+ unpack_singularity: false + + # Allow the user to configure which container registry containers should be pulled from + # Note that this assumes container names are consistent across registries, and that the + # registry being passed doesn't require authentication for pull actions + registry: + base_url: docker.io + # The owner or project of the registry + # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs + owner: reedcompbio # This list of algorithms should be generated by a script which checks the filesystem for installs. # It shouldn't be changed by mere mortals. (alternatively, we could add a path to executable for each algorithm diff --git a/spras/config/config.py b/spras/config/config.py index c6ac8f8e0..6d6ee4b7e 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -23,13 +23,12 @@ import numpy as np import yaml -from spras.config.schema import Analysis, ContainerFramework, RawConfig +from spras.config.container_schema import ProcessedContainerOptions +from spras.config.schema import Analysis, RawConfig from spras.util import NpHashEncoder, hash_params_sha1_base32 config = None -DEFAULT_CONTAINER_PREFIX = "docker.io/reedcompbio" - # This will get called in the Snakefile, instantiating the singleton with the raw config def init_global(config_dict): global config @@ -67,9 +66,7 @@ def __init__(self, raw_config: dict[str, Any]): # Directory used for storing output self.out_dir = parsed_raw_config.reconstruction_settings.locations.reconstruction_dir # Container framework used by PRMs. Valid options are "docker", "dsub", and "singularity" - self.container_framework = None - # The container prefix (host and organization) to use for images. Default is "docker.io/reedcompbio" - self.container_prefix: str = DEFAULT_CONTAINER_PREFIX + self.container_settings = ProcessedContainerOptions.from_container_settings(parsed_raw_config.containers, parsed_raw_config.hash_length) # A Boolean specifying whether to unpack singularity containers. Default is False self.unpack_singularity = False # A dictionary to store configured datasets against which SPRAS will be run @@ -275,22 +272,8 @@ def process_analysis(self, raw_config: RawConfig): self.analysis_include_evaluation_aggregate_algo = False def process_config(self, raw_config: RawConfig): - # Set up a few top-level config variables self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir - if raw_config.container_framework == ContainerFramework.dsub: - warnings.warn("'dsub' framework integration is experimental and may not be fully supported.", stacklevel=2) - self.container_framework = raw_config.container_framework - - # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. - if raw_config.unpack_singularity and self.container_framework != "singularity": - warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. 
This setting will have no effect.", stacklevel=2) - self.unpack_singularity = raw_config.unpack_singularity - - # Grab registry from the config, and if none is provided default to docker - if raw_config.container_registry and raw_config.container_registry.base_url != "" and raw_config.container_registry.owner != "": - self.container_prefix = raw_config.container_registry.base_url + "/" + raw_config.container_registry.owner - self.process_datasets(raw_config) self.process_algorithms(raw_config) self.process_analysis(raw_config) diff --git a/spras/config/container_schema.py b/spras/config/container_schema.py new file mode 100644 index 000000000..d515e0de0 --- /dev/null +++ b/spras/config/container_schema.py @@ -0,0 +1,64 @@ +""" +The separate container schema specification file. +For information about pydantic, see schema.py. + +We move this to a separate file to allow `containers.py` to explicitly take in +this subsection of the configuration. +""" + +from dataclasses import dataclass +from pydantic import BaseModel, ConfigDict, Field +from typing import Optional +import warnings + +from spras.config.util import CaseInsensitiveEnum + +DEFAULT_CONTAINER_PREFIX = "docker.io/reedcompbio" + +class ContainerFramework(CaseInsensitiveEnum): + docker = 'docker' + # TODO: add apptainer variant once #260 gets merged + singularity = 'singularity' + dsub = 'dsub' + +class ContainerRegistry(BaseModel): + base_url: str + owner: str = Field(description="The owner or project of the registry") + + model_config = ConfigDict(extra='forbid') + +class ContainerSettings(BaseModel): + framework: ContainerFramework = ContainerFramework.docker + unpack_singularity: bool = False + registry: ContainerRegistry + hash_length: Optional[int] = None + +@dataclass +class ProcessedContainerOptions: + container_framework: ContainerFramework + unpack_singularity: bool + container_prefix: str + hash_length: int + + @staticmethod + def from_container_settings(settings: ContainerSettings, default_hash_length: int) -> "ProcessedContainerOptions": + if settings.framework == ContainerFramework.dsub: + warnings.warn("'dsub' framework integration is experimental and may not be fully supported.", stacklevel=2) + container_framework = settings.framework + + # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. + if settings.unpack_singularity and container_framework != "singularity": + warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. 
This setting will have no effect.", stacklevel=2) + unpack_singularity = settings.unpack_singularity + + # Grab registry from the config, and if none is provided default to docker + container_prefix = DEFAULT_CONTAINER_PREFIX + if settings.registry and settings.registry.base_url != "" and settings.registry.owner != "": + container_prefix = settings.registry.base_url + "/" + settings.registry.owner + + return ProcessedContainerOptions( + container_framework=container_framework, + unpack_singularity=unpack_singularity, + container_prefix=container_prefix, + hash_length=settings.hash_length or default_hash_length + ) diff --git a/spras/config/schema.py b/spras/config/schema.py index 76404b387..7657a41a0 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -16,6 +16,7 @@ from pydantic import AfterValidator, BaseModel, ConfigDict, Field from spras.config.algorithms import AlgorithmUnion +from spras.config.container_schema import ContainerSettings from spras.config.util import CaseInsensitiveEnum class SummaryAnalysis(BaseModel): @@ -115,10 +116,7 @@ class ReconstructionSettings(BaseModel): model_config = ConfigDict(extra='forbid') class RawConfig(BaseModel): - # TODO: move these container values to a nested container key - container_framework: ContainerFramework = ContainerFramework.docker - unpack_singularity: bool = False - container_registry: ContainerRegistry + containers: ContainerSettings hash_length: int = Field( description="The length of the hash used to identify a parameter combination", diff --git a/spras/containers.py b/spras/containers.py index 314d4bb45..e41d4737e 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -8,7 +8,7 @@ import docker import docker.errors -import spras.config.config as config +from spras.config.container_schema import ProcessedContainerOptions from spras.logging import indent from spras.util import hash_filename @@ -131,7 +131,7 @@ def env_to_items(environment: dict[str, str]) -> Iterator[str]: # TODO consider a better default environment variable # Follow docker-py's naming conventions (https://docker-py.readthedocs.io/en/stable/containers.html) # Technically the argument is an image, not a container, but we use container here. 
-def run_container(framework: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, environment: Optional[dict[str, str]] = None): +def run_container(framework: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, config: ProcessedContainerOptions, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity or Docker @param framework: singularity or docker @@ -144,17 +144,17 @@ def run_container(framework: str, container_suffix: str, command: List[str], vol """ normalized_framework = framework.casefold() - container = config.config.container_prefix + "/" + container_suffix + container = config.container_prefix + "/" + container_suffix if normalized_framework == 'docker': return run_container_docker(container, command, volumes, working_dir, environment) elif normalized_framework == 'singularity': - return run_container_singularity(container, command, volumes, working_dir, environment) + return run_container_singularity(container, command, volumes, working_dir, config, environment) elif normalized_framework == 'dsub': return run_container_dsub(container, command, volumes, working_dir, environment) else: raise ValueError(f'{framework} is not a recognized container framework. Choose "docker", "dsub", or "singularity".') -def run_container_and_log(name: str, framework: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, environment: Optional[dict[str, str]] = None): +def run_container_and_log(name: str, framework: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, config: ProcessedContainerOptions, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity or Docker with associated pretty printed messages. @param name: the display name of the running container for logging purposes @@ -171,7 +171,7 @@ def run_container_and_log(name: str, framework: str, container_suffix: str, comm print('Running {} on container framework "{}" on env {} with command: {}'.format(name, framework, list(env_to_items(environment)), ' '.join(command)), flush=True) try: - out = run_container(framework=framework, container_suffix=container_suffix, command=command, volumes=volumes, working_dir=working_dir, environment=environment) + out = run_container(framework=framework, container_suffix=container_suffix, command=command, volumes=volumes, working_dir=working_dir, config=config, environment=environment) if out is not None: if isinstance(out, list): out = ''.join(out) @@ -290,7 +290,7 @@ def run_container_docker(container: str, command: List[str], volumes: List[Tuple return out -def run_container_singularity(container: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, environment: Optional[dict[str, str]] = None): +def run_container_singularity(container: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, config: ProcessedContainerOptions, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity. Only available on Linux. @@ -329,7 +329,7 @@ def run_container_singularity(container: str, command: List[str], volumes: List[ singularity_options.extend(['--env', ",".join(env_to_items(environment))]) # Handle unpacking singularity image if needed. 
Potentially needed for running nested unprivileged containers - if config.config.unpack_singularity: + if config.unpack_singularity: # Split the string by "/" path_elements = container.split("/") @@ -369,7 +369,7 @@ def run_container_singularity(container: str, command: List[str], volumes: List[ # Because this is called independently for each file, the same local path can be mounted to multiple volumes -def prepare_volume(filename: Union[str, os.PathLike], volume_base: Union[str, PurePath]) -> Tuple[Tuple[PurePath, PurePath], str]: +def prepare_volume(filename: Union[str, os.PathLike], volume_base: Union[str, PurePath], config: ProcessedContainerOptions) -> Tuple[Tuple[PurePath, PurePath], str]: """ Makes a file on the local file system accessible within a container by mapping the local (source) path to a new container (destination) path and renaming the file to be relative to the destination path. @@ -388,7 +388,7 @@ def prepare_volume(filename: Union[str, os.PathLike], volume_base: Union[str, Pu if isinstance(filename, os.PathLike): filename = str(filename) - filename_hash = hash_filename(filename, config.config.hash_length) + filename_hash = hash_filename(filename, config.hash_length) dest = PurePosixPath(base_path, filename_hash) abs_filename = Path(filename).resolve() From 5effe6980aebcb57dfd61b588ab39c93dcc33cbd Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 15 Jul 2025 00:00:07 +0000 Subject: [PATCH 13/68] feat: initial schema --- config/schema.json | 1153 ++++++++++++++++++++++++++++++ spras/allpairs.py | 20 +- spras/btb.py | 15 +- spras/config/config.py | 4 +- spras/config/container_schema.py | 20 +- spras/containers.py | 26 +- spras/prm.py | 5 +- 7 files changed, 1196 insertions(+), 47 deletions(-) create mode 100644 config/schema.json diff --git a/config/schema.json b/config/schema.json new file mode 100644 index 000000000..f99541d51 --- /dev/null +++ b/config/schema.json @@ -0,0 +1,1153 @@ +{ + "$defs": { + "Analysis": { + "additionalProperties": false, + "properties": { + "summary": { + "$ref": "#/$defs/SummaryAnalysis", + "default": { + "include": false + } + }, + "cytoscape": { + "$ref": "#/$defs/CytoscapeAnalysis", + "default": { + "include": false + } + }, + "ml": { + "$ref": "#/$defs/MlAnalysis", + "default": { + "include": false, + "aggregate_per_algorithm": false, + "components": 2, + "labels": true, + "linkage": "ward", + "metric": "euclidean" + } + }, + "evaluation": { + "$ref": "#/$defs/EvaluationAnalysis", + "default": { + "include": false, + "aggregate_per_algorithm": false + } + } + }, + "title": "Analysis", + "type": "object" + }, + "ContainerFramework": { + "enum": [ + "docker", + "singularity", + "dsub" + ], + "title": "ContainerFramework", + "type": "string" + }, + "ContainerRegistry": { + "additionalProperties": false, + "properties": { + "base_url": { + "title": "Base Url", + "type": "string" + }, + "owner": { + "description": "The owner or project of the registry", + "title": "Owner", + "type": "string" + } + }, + "required": [ + "base_url", + "owner" + ], + "title": "ContainerRegistry", + "type": "object" + }, + "ContainerSettings": { + "properties": { + "framework": { + "$ref": "#/$defs/ContainerFramework", + "default": "docker" + }, + "unpack_singularity": { + "default": false, + "title": "Unpack Singularity", + "type": "boolean" + }, + "registry": { + "$ref": "#/$defs/ContainerRegistry" + }, + "hash_length": { + "default": 7, + "title": "Hash Length", + "type": "integer" + } + }, + "required": [ + "registry" + ], + "title": 
"ContainerSettings", + "type": "object" + }, + "CytoscapeAnalysis": { + "additionalProperties": false, + "properties": { + "include": { + "title": "Include", + "type": "boolean" + } + }, + "required": [ + "include" + ], + "title": "CytoscapeAnalysis", + "type": "object" + }, + "Dataset": { + "additionalProperties": false, + "properties": { + "label": { + "title": "Label", + "type": "string" + }, + "node_files": { + "items": { + "type": "string" + }, + "title": "Node Files", + "type": "array" + }, + "edge_files": { + "items": { + "type": "string" + }, + "title": "Edge Files", + "type": "array" + }, + "other_files": { + "items": { + "type": "string" + }, + "title": "Other Files", + "type": "array" + }, + "data_dir": { + "title": "Data Dir", + "type": "string" + } + }, + "required": [ + "label", + "node_files", + "edge_files", + "other_files", + "data_dir" + ], + "title": "Dataset", + "type": "object" + }, + "EvaluationAnalysis": { + "additionalProperties": false, + "properties": { + "include": { + "title": "Include", + "type": "boolean" + }, + "aggregate_per_algorithm": { + "default": false, + "title": "Aggregate Per Algorithm", + "type": "boolean" + } + }, + "required": [ + "include" + ], + "title": "EvaluationAnalysis", + "type": "object" + }, + "GoldStandard": { + "additionalProperties": false, + "properties": { + "label": { + "title": "Label", + "type": "string" + }, + "node_files": { + "items": { + "type": "string" + }, + "title": "Node Files", + "type": "array" + }, + "data_dir": { + "title": "Data Dir", + "type": "string" + }, + "dataset_labels": { + "items": { + "type": "string" + }, + "title": "Dataset Labels", + "type": "array" + } + }, + "required": [ + "label", + "node_files", + "data_dir", + "dataset_labels" + ], + "title": "GoldStandard", + "type": "object" + }, + "Locations": { + "additionalProperties": false, + "properties": { + "reconstruction_dir": { + "title": "Reconstruction Dir", + "type": "string" + } + }, + "required": [ + "reconstruction_dir" + ], + "title": "Locations", + "type": "object" + }, + "MlAnalysis": { + "additionalProperties": false, + "properties": { + "include": { + "title": "Include", + "type": "boolean" + }, + "aggregate_per_algorithm": { + "default": false, + "title": "Aggregate Per Algorithm", + "type": "boolean" + }, + "components": { + "default": 2, + "title": "Components", + "type": "integer" + }, + "labels": { + "default": true, + "title": "Labels", + "type": "boolean" + }, + "linkage": { + "$ref": "#/$defs/MlLinkage", + "default": "ward" + }, + "metric": { + "$ref": "#/$defs/MlMetric", + "default": "euclidean" + } + }, + "required": [ + "include" + ], + "title": "MlAnalysis", + "type": "object" + }, + "MlLinkage": { + "enum": [ + "ward", + "complete", + "average", + "single" + ], + "title": "MlLinkage", + "type": "string" + }, + "MlMetric": { + "enum": [ + "euclidean", + "manhattan", + "cosine" + ], + "title": "MlMetric", + "type": "string" + }, + "ReconstructionSettings": { + "additionalProperties": false, + "properties": { + "locations": { + "$ref": "#/$defs/Locations" + } + }, + "required": [ + "locations" + ], + "title": "ReconstructionSettings", + "type": "object" + }, + "SummaryAnalysis": { + "additionalProperties": false, + "properties": { + "include": { + "title": "Include", + "type": "boolean" + } + }, + "required": [ + "include" + ], + "title": "SummaryAnalysis", + "type": "object" + }, + "allpairsModel": { + "properties": { + "name": { + "const": "allpairs", + "title": "Name", + "type": "string" + }, + "include": { + "title": 
"Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/allpairsRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "allpairsModel", + "type": "object" + }, + "allpairsRunModel": { + "properties": {}, + "title": "allpairsRunModel", + "type": "object" + }, + "bowtiebuilderModel": { + "properties": { + "name": { + "const": "bowtiebuilder", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/bowtiebuilderRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "bowtiebuilderModel", + "type": "object" + }, + "bowtiebuilderRunModel": { + "properties": {}, + "title": "bowtiebuilderRunModel", + "type": "object" + }, + "dominoModel": { + "properties": { + "name": { + "const": "domino", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/dominoRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "dominoModel", + "type": "object" + }, + "dominoRunModel": { + "properties": { + "time": { + "items": { + "type": "number" + }, + "title": "Time", + "type": "array" + }, + "module_threshold": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Module Threshold", + "type": "array" + }, + "slice_threshold": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Slice Threshold", + "type": "array" + } + }, + "required": [ + "time", + "module_threshold", + "slice_threshold" + ], + "title": "dominoRunModel", + "type": "object" + }, + "meoModel": { + "properties": { + "name": { + "const": "meo", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/meoRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "meoModel", + "type": "object" + }, + "meoRunModel": { + "properties": { + "max_path_length": { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "title": "Max Path Length", + "type": "array" + }, + "local_search": { + "items": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ] + }, + "title": "Local Search", + "type": "array" + }, + "rand_restarts": { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "title": "Rand Restarts", + "type": "array" + } + }, + "required": [ + "max_path_length", + "local_search", + "rand_restarts" + ], + "title": "meoRunModel", + "type": "object" + }, + "mincostflowModel": { + "properties": { + "name": { + "const": "mincostflow", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/mincostflowRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "mincostflowModel", + "type": "object" + }, + "mincostflowRunModel": { + "properties": { + "flow": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Flow", + "type": 
"array" + }, + "capacity": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Capacity", + "type": "array" + } + }, + "required": [ + "flow", + "capacity" + ], + "title": "mincostflowRunModel", + "type": "object" + }, + "omicsintegrator1Model": { + "properties": { + "name": { + "const": "omicsintegrator1", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/omicsintegrator1RunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "omicsintegrator1Model", + "type": "object" + }, + "omicsintegrator1RunModel": { + "properties": { + "dummy_mode": { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "title": "Dummy Mode", + "type": "array" + }, + "mu_squared": { + "items": { + "type": "boolean" + }, + "title": "Mu Squared", + "type": "array" + }, + "exclude_terms": { + "items": { + "type": "boolean" + }, + "title": "Exclude Terms", + "type": "array" + }, + "noisy_edges": { + "items": { + "type": "integer" + }, + "title": "Noisy Edges", + "type": "array" + }, + "shuffled_prizes": { + "items": { + "type": "integer" + }, + "title": "Shuffled Prizes", + "type": "array" + }, + "random_terminals": { + "items": { + "type": "integer" + }, + "title": "Random Terminals", + "type": "array" + }, + "seed": { + "items": { + "type": "integer" + }, + "title": "Seed", + "type": "array" + }, + "w": { + "items": { + "type": "integer" + }, + "title": "W", + "type": "array" + }, + "b": { + "items": { + "type": "number" + }, + "title": "B", + "type": "array" + }, + "d": { + "items": { + "type": "integer" + }, + "title": "D", + "type": "array" + }, + "mu": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Mu", + "type": "array" + }, + "noise": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Noise", + "type": "array" + }, + "g": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "G", + "type": "array" + }, + "r": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "R", + "type": "array" + } + }, + "required": [ + "dummy_mode", + "mu_squared", + "exclude_terms", + "noisy_edges", + "shuffled_prizes", + "random_terminals", + "seed", + "w", + "b", + "d", + "mu", + "noise", + "g", + "r" + ], + "title": "omicsintegrator1RunModel", + "type": "object" + }, + "omicsintegrator2Model": { + "properties": { + "name": { + "const": "omicsintegrator2", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/omicsintegrator2RunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "omicsintegrator2Model", + "type": "object" + }, + "omicsintegrator2RunModel": { + "properties": { + "w": { + "items": { + "type": "number" + }, + "title": "W", + "type": "array" + }, + "b": { + "items": { + "type": "number" + }, + "title": "B", + "type": "array" + }, + "g": { + "items": { + "type": "number" + }, + "title": "G", + "type": "array" + }, + "noise": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Noise", + "type": "array" + }, + "noisy_edges": { + 
"items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "title": "Noisy Edges", + "type": "array" + }, + "random_terminals": { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "title": "Random Terminals", + "type": "array" + }, + "dummy_mode": { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "title": "Dummy Mode", + "type": "array" + }, + "seed": { + "items": { + "type": "integer" + }, + "title": "Seed", + "type": "array" + } + }, + "required": [ + "w", + "b", + "g", + "noise", + "noisy_edges", + "random_terminals", + "dummy_mode", + "seed" + ], + "title": "omicsintegrator2RunModel", + "type": "object" + }, + "pathlinkerModel": { + "properties": { + "name": { + "const": "pathlinker", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/pathlinkerRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "pathlinkerModel", + "type": "object" + }, + "pathlinkerRunModel": { + "properties": { + "k": { + "items": { + "type": "integer" + }, + "title": "K", + "type": "array" + } + }, + "required": [ + "k" + ], + "title": "pathlinkerRunModel", + "type": "object" + }, + "rwrModel": { + "properties": { + "name": { + "const": "rwr", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/rwrRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "rwrModel", + "type": "object" + }, + "rwrRunModel": { + "properties": { + "threshold": { + "items": { + "type": "integer" + }, + "title": "Threshold", + "type": "array" + }, + "alpha": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Alpha", + "type": "array" + } + }, + "required": [ + "threshold", + "alpha" + ], + "title": "rwrRunModel", + "type": "object" + }, + "strwrModel": { + "properties": { + "name": { + "const": "strwr", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/strwrRunModel" + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include", + "runs" + ], + "title": "strwrModel", + "type": "object" + }, + "strwrRunModel": { + "properties": { + "threshold": { + "items": { + "type": "integer" + }, + "title": "Threshold", + "type": "array" + }, + "alpha": { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] + }, + "title": "Alpha", + "type": "array" + } + }, + "required": [ + "threshold", + "alpha" + ], + "title": "strwrRunModel", + "type": "object" + } + }, + "additionalProperties": false, + "properties": { + "containers": { + "$ref": "#/$defs/ContainerSettings" + }, + "hash_length": { + "default": 7, + "description": "The length of the hash used to identify a parameter combination", + "title": "Hash Length", + "type": "integer" + }, + "algorithms": { + "items": { + "anyOf": [ + { + "$ref": "#/$defs/allpairsModel" + }, + { + "$ref": "#/$defs/bowtiebuilderModel" + }, + { + "$ref": "#/$defs/dominoModel" + }, + { + "$ref": "#/$defs/meoModel" + }, + { + "$ref": "#/$defs/mincostflowModel" + }, + { + "$ref": "#/$defs/omicsintegrator1Model" + }, + { + "$ref": 
"#/$defs/omicsintegrator2Model" + }, + { + "$ref": "#/$defs/pathlinkerModel" + }, + { + "$ref": "#/$defs/rwrModel" + }, + { + "$ref": "#/$defs/strwrModel" + } + ] + }, + "title": "Algorithms", + "type": "array" + }, + "datasets": { + "items": { + "$ref": "#/$defs/Dataset" + }, + "title": "Datasets", + "type": "array" + }, + "gold_standards": { + "default": [], + "items": { + "$ref": "#/$defs/GoldStandard" + }, + "title": "Gold Standards", + "type": "array" + }, + "analysis": { + "$ref": "#/$defs/Analysis", + "default": { + "summary": { + "include": false + }, + "cytoscape": { + "include": false + }, + "ml": { + "aggregate_per_algorithm": false, + "components": 2, + "include": false, + "labels": true, + "linkage": "ward", + "metric": "euclidean" + }, + "evaluation": { + "aggregate_per_algorithm": false, + "include": false + } + } + }, + "reconstruction_settings": { + "$ref": "#/$defs/ReconstructionSettings" + } + }, + "required": [ + "containers", + "algorithms", + "datasets", + "reconstruction_settings" + ], + "title": "RawConfig", + "type": "object" +} \ No newline at end of file diff --git a/spras/allpairs.py b/spras/allpairs.py index 670d3f721..bba5df467 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -1,6 +1,7 @@ import warnings from pathlib import Path +from spras.config.container_schema import ProcessedContainerSettings from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset @@ -72,14 +73,7 @@ def generate_inputs(data: Dataset, filename_map): header=["#Interactor1", "Interactor2", "Weight"]) @staticmethod - def run(inputs, output_file, args=None, container_framework="docker"): - """ - Run All Pairs Shortest Paths with Docker - @param nodetypes: input node types with sources and targets (required) - @param network: input network file (required) - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - @param output_file: path to the output pathway file (required) - """ + def run(inputs, output_file, args=None, container_settings=ProcessedContainerSettings()): if not inputs["nodetypes"] or not inputs["network"] or not inputs["directed_flag"]: raise ValueError('Required All Pairs Shortest Paths arguments are missing') @@ -88,15 +82,15 @@ def run(inputs, output_file, args=None, container_framework="docker"): # Each volume is a tuple (src, dest) volumes = list() - bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir) + bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir, container_settings) volumes.append(bind_path) - bind_path, network_file = prepare_volume(inputs["network"], work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) # Create the parent directories for the output file if needed Path(output_file).parent.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_file = prepare_volume(output_file, work_dir) + bind_path, mapped_out_file = prepare_volume(output_file, work_dir, container_settings) volumes.append(bind_path) command = ['python', @@ -110,11 +104,11 @@ def run(inputs, output_file, args=None, container_framework="docker"): container_suffix = "allpairs:v4" run_container_and_log( 'All Pairs Shortest Paths', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) @staticmethod def parse_output(raw_pathway_file, standardized_pathway_file, params): diff --git 
a/spras/btb.py b/spras/btb.py index 81474bdb2..7f7a1b944 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -1,5 +1,6 @@ from pathlib import Path +from spras.config.container_schema import ProcessedContainerSettings from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( @@ -65,7 +66,7 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(inputs, output_file, args=None, container_framework="docker"): + def run(inputs, output_file, args=None, container_settings=ProcessedContainerSettings()): # Tests for pytest (docker container also runs this) # Testing out here avoids the trouble that container errors provide @@ -93,19 +94,19 @@ def run(inputs, output_file, args=None, container_framework="docker"): # Each volume is a tuple (src, dest) volumes = list() - bind_path, source_file = prepare_volume(inputs["sources"], work_dir) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, target_file = prepare_volume(inputs["targets"], work_dir) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) - bind_path, edges_file = prepare_volume(inputs["edges"], work_dir) + bind_path, edges_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) # Use its --output argument to set the output file prefix to specify an absolute path and prefix out_dir = Path(output_file).parent out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = mapped_out_dir + '/raw-pathway.txt' # Use posix path inside the container @@ -122,11 +123,11 @@ def run(inputs, output_file, args=None, container_framework="docker"): container_suffix = "bowtiebuilder:v2" run_container_and_log('BowTieBuilder', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) # Output is already written to raw-pathway.txt file diff --git a/spras/config/config.py b/spras/config/config.py index 6d6ee4b7e..252d6ccf5 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -23,7 +23,7 @@ import numpy as np import yaml -from spras.config.container_schema import ProcessedContainerOptions +from spras.config.container_schema import ProcessedContainerSettings from spras.config.schema import Analysis, RawConfig from spras.util import NpHashEncoder, hash_params_sha1_base32 @@ -66,7 +66,7 @@ def __init__(self, raw_config: dict[str, Any]): # Directory used for storing output self.out_dir = parsed_raw_config.reconstruction_settings.locations.reconstruction_dir # Container framework used by PRMs. Valid options are "docker", "dsub", and "singularity" - self.container_settings = ProcessedContainerOptions.from_container_settings(parsed_raw_config.containers, parsed_raw_config.hash_length) + self.container_settings = ProcessedContainerSettings.from_container_settings(parsed_raw_config.containers, parsed_raw_config.hash_length) # A Boolean specifying whether to unpack singularity containers. 
Default is False self.unpack_singularity = False # A dictionary to store configured datasets against which SPRAS will be run diff --git a/spras/config/container_schema.py b/spras/config/container_schema.py index d515e0de0..9688a9b51 100644 --- a/spras/config/container_schema.py +++ b/spras/config/container_schema.py @@ -31,17 +31,17 @@ class ContainerSettings(BaseModel): framework: ContainerFramework = ContainerFramework.docker unpack_singularity: bool = False registry: ContainerRegistry - hash_length: Optional[int] = None + hash_length: int = 7 @dataclass -class ProcessedContainerOptions: - container_framework: ContainerFramework - unpack_singularity: bool - container_prefix: str - hash_length: int +class ProcessedContainerSettings: + framework: ContainerFramework = ContainerFramework.docker + unpack_singularity: bool = False + prefix: str = DEFAULT_CONTAINER_PREFIX + hash_length: int = 7 @staticmethod - def from_container_settings(settings: ContainerSettings, default_hash_length: int) -> "ProcessedContainerOptions": + def from_container_settings(settings: ContainerSettings, default_hash_length: int) -> "ProcessedContainerSettings": if settings.framework == ContainerFramework.dsub: warnings.warn("'dsub' framework integration is experimental and may not be fully supported.", stacklevel=2) container_framework = settings.framework @@ -56,9 +56,9 @@ def from_container_settings(settings: ContainerSettings, default_hash_length: in if settings.registry and settings.registry.base_url != "" and settings.registry.owner != "": container_prefix = settings.registry.base_url + "/" + settings.registry.owner - return ProcessedContainerOptions( - container_framework=container_framework, + return ProcessedContainerSettings( + framework=container_framework, unpack_singularity=unpack_singularity, - container_prefix=container_prefix, + prefix=container_prefix, hash_length=settings.hash_length or default_hash_length ) diff --git a/spras/containers.py b/spras/containers.py index e41d4737e..d065b2ea8 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -8,7 +8,7 @@ import docker import docker.errors -from spras.config.container_schema import ProcessedContainerOptions +from spras.config.container_schema import ProcessedContainerSettings from spras.logging import indent from spras.util import hash_filename @@ -131,47 +131,47 @@ def env_to_items(environment: dict[str, str]) -> Iterator[str]: # TODO consider a better default environment variable # Follow docker-py's naming conventions (https://docker-py.readthedocs.io/en/stable/containers.html) # Technically the argument is an image, not a container, but we use container here. 
-def run_container(framework: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, config: ProcessedContainerOptions, environment: Optional[dict[str, str]] = None): +def run_container(container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, container_settings: ProcessedContainerSettings, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity or Docker - @param framework: singularity or docker @param container_suffix: name of the DockerHub container without the 'docker://' prefix @param command: command to run in the container @param volumes: a list of volumes to mount where each item is a (source, destination) tuple @param working_dir: the working directory in the container + @param container_settings: the settings to use to run the container @param environment: environment variables to set in the container @return: output from Singularity execute or Docker run """ - normalized_framework = framework.casefold() + normalized_framework = container_settings.framework.casefold() - container = config.container_prefix + "/" + container_suffix + container = container_settings.prefix + "/" + container_suffix if normalized_framework == 'docker': return run_container_docker(container, command, volumes, working_dir, environment) elif normalized_framework == 'singularity': - return run_container_singularity(container, command, volumes, working_dir, config, environment) + return run_container_singularity(container, command, volumes, working_dir, container_settings, environment) elif normalized_framework == 'dsub': return run_container_dsub(container, command, volumes, working_dir, environment) else: - raise ValueError(f'{framework} is not a recognized container framework. Choose "docker", "dsub", or "singularity".') + raise ValueError(f'{container_settings.framework} is not a recognized container framework. Choose "docker", "dsub", or "singularity".') -def run_container_and_log(name: str, framework: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, config: ProcessedContainerOptions, environment: Optional[dict[str, str]] = None): +def run_container_and_log(name: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, container_settings: ProcessedContainerSettings, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity or Docker with associated pretty printed messages. 
@param name: the display name of the running container for logging purposes - @param framework: singularity or docker @param container_suffix: name of the DockerHub container without the 'docker://' prefix @param command: command to run in the container @param volumes: a list of volumes to mount where each item is a (source, destination) tuple @param working_dir: the working directory in the container + @param container_settings: the container settings to use @param environment: environment variables to set in the container @return: output from Singularity execute or Docker run """ if not environment: environment = {'SPRAS': 'True'} - print('Running {} on container framework "{}" on env {} with command: {}'.format(name, framework, list(env_to_items(environment)), ' '.join(command)), flush=True) + print('Running {} on container framework "{}" on env {} with command: {}'.format(name, container_settings.framework, list(env_to_items(environment)), ' '.join(command)), flush=True) try: - out = run_container(framework=framework, container_suffix=container_suffix, command=command, volumes=volumes, working_dir=working_dir, config=config, environment=environment) + out = run_container(container_suffix=container_suffix, command=command, volumes=volumes, working_dir=working_dir, container_settings=container_settings, environment=environment) if out is not None: if isinstance(out, list): out = ''.join(out) @@ -290,7 +290,7 @@ def run_container_docker(container: str, command: List[str], volumes: List[Tuple return out -def run_container_singularity(container: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, config: ProcessedContainerOptions, environment: Optional[dict[str, str]] = None): +def run_container_singularity(container: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, config: ProcessedContainerSettings, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity. Only available on Linux. @@ -369,7 +369,7 @@ def run_container_singularity(container: str, command: List[str], volumes: List[ # Because this is called independently for each file, the same local path can be mounted to multiple volumes -def prepare_volume(filename: Union[str, os.PathLike], volume_base: Union[str, PurePath], config: ProcessedContainerOptions) -> Tuple[Tuple[PurePath, PurePath], str]: +def prepare_volume(filename: Union[str, os.PathLike], volume_base: Union[str, PurePath], config: ProcessedContainerSettings) -> Tuple[Tuple[PurePath, PurePath], str]: """ Makes a file on the local file system accessible within a container by mapping the local (source) path to a new container (destination) path and renaming the file to be relative to the destination path. 
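Taken together, a PRM wrapper now threads a single ProcessedContainerSettings value
through both volume preparation and container execution, as the AllPairs and
BowTieBuilder diffs above do. A minimal sketch of the new calling convention,
assuming the default settings (the input path and command are illustrative
placeholders, not SPRAS's real entrypoints):

    from spras.config.container_schema import ProcessedContainerSettings
    from spras.containers import prepare_volume, run_container_and_log

    settings = ProcessedContainerSettings()  # defaults: docker, default registry prefix, hash_length=7
    work_dir = '/apsp'
    # hypothetical input file; real wrappers take paths from their filename_map
    bind_path, node_file = prepare_volume('nodetypes.txt', work_dir, settings)
    run_container_and_log(
        'All Pairs Shortest Paths',  # display name, used only for logging
        'allpairs:v4',               # image suffix; the registry prefix comes from settings
        ['python', 'apsp.py', '--nodes', node_file],  # illustrative command
        [bind_path],                 # volumes: list of (src, dest) tuples
        work_dir,
        settings)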
diff --git a/spras/prm.py b/spras/prm.py index 73c94454a..d52214083 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -4,6 +4,7 @@ from pydantic import BaseModel +from spras.config.container_schema import ProcessedContainerSettings from spras.dataset import Dataset T = TypeVar('T', bound=BaseModel) @@ -42,10 +43,10 @@ def generate_inputs(data: Dataset, filename_map: dict[str, str]): @staticmethod @abstractmethod - def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: T, container_framework="docker"): + def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: T, container_settings: ProcessedContainerSettings): """ Runs an algorithm with the specified inputs, algorithm params (T), - the designated output_file, and the desired container_framework. + the designated output_file, and the desired container_settings. """ raise NotImplementedError From 398350e68a3f1776e829c5f7f4823560cd73f7b8 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Tue, 15 Jul 2025 08:22:58 -0700 Subject: [PATCH 14/68] feat: more algs schema handling --- config/config.yaml | 30 +- config/schema.json | 1017 ++++++++++++++++++++++++------------ spras/config/algorithms.py | 57 +- 3 files changed, 756 insertions(+), 348 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index a834131e6..3e2127d53 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,3 +1,5 @@ +# yaml-language-server: $schema=./schema.json + # Global workflow control # The length of the hash used to identify a parameter combination @@ -50,14 +52,14 @@ containers: algorithms: - name: "pathlinker" - params: - include: true + include: true + runs: run1: k: range(100,201,100) - name: "omicsintegrator1" - params: - include: true + include: true + runs: run1: b: [5, 6] w: np.linspace(0,5,2) @@ -65,8 +67,8 @@ algorithms: dummy_mode: "file" # Or "terminals", "all", "others" - name: "omicsintegrator2" - params: - include: true + include: true + runs: run1: b: 4 g: 0 @@ -79,7 +81,7 @@ algorithms: runs: run1: max_path_length: 3 - local_search: "Yes" + local_search: true rand_restarts: 10 - name: "mincostflow" @@ -90,8 +92,7 @@ algorithms: capacity: 1 - name: "allpairs" - params: - include: true + include: true - name: "domino" params: @@ -101,22 +102,21 @@ algorithms: module_threshold: 0.05 - name: "strwr" - params: - include: true + include: true + runs: run1: alpha: [0.85] threshold: [100, 200] - name: "rwr" - params: - include: true + include: true + runs: run1: alpha: [0.85] threshold: [100, 200] - name: "bowtiebuilder" - params: - include: true + include: true # Here we specify which pathways to run and other file location information. # DataLoader.py can currently only load a single dataset diff --git a/config/schema.json b/config/schema.json index f99541d51..01494a4ea 100644 --- a/config/schema.json +++ b/config/schema.json @@ -393,47 +393,77 @@ }, "dominoRunModel": { "properties": { - "time": { - "items": { - "type": "number" - }, - "title": "Time", - "type": "array" + "_time": { + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The internal _time parameter. This is a parameter only given to nondeterminsitic\nalgorithms that provide no randomness seed. 
While this should be unset,\nwe allow specifying `_time` for users that want to re-use outputs of runs,\nthough this explicitly breaks the 'immutability' promise of runs.", + "title": "Time" }, "module_threshold": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Module Threshold", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "the p-value threshold for considering a slice as relevant (optional)", + "title": "Module Threshold" }, "slice_threshold": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Slice Threshold", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "the p-value threshold for considering a putative module as final module (optional)", + "title": "Slice Threshold" } }, - "required": [ - "time", - "module_threshold", - "slice_threshold" - ], "title": "dominoRunModel", "type": "object" }, @@ -467,53 +497,84 @@ "meoRunModel": { "properties": { "max_path_length": { - "items": { - "anyOf": [ - { - "type": "integer" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Max Path Length", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "the maximal length of a path from sources and targets to orient.", + "title": "Max Path Length" }, "local_search": { - "items": { - "anyOf": [ - { - "type": "boolean" + "anyOf": [ + { + "type": "boolean" + }, + { + "items": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Local Search", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "a boolean parameter that enables MEO's local search functionality.\nSee \"Improving approximations with local search\" in the associated paper\nfor more information.", + "title": "Local Search" }, "rand_restarts": { - "items": { - "anyOf": [ - { - "type": "integer" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Rand Restarts", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The number of random restarts to do.", + "title": "Rand Restarts" } }, - "required": [ - "max_path_length", - "local_search", - "rand_restarts" - ], "title": "meoRunModel", "type": "object" }, @@ -547,38 +608,58 @@ "mincostflowRunModel": { "properties": { "flow": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Flow", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "amount of flow going through the graph", + "title": "Flow" }, "capacity": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": 
[ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Capacity", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "amount of capacity allowed on each edge", + "title": "Capacity" } }, - "required": [ - "flow", - "capacity" - ], "title": "mincostflowRunModel", "type": "object" }, @@ -612,154 +693,298 @@ "omicsintegrator1RunModel": { "properties": { "dummy_mode": { - "items": { - "anyOf": [ - { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Dummy Mode", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Dummy Mode" }, "mu_squared": { - "items": { - "type": "boolean" - }, - "title": "Mu Squared", - "type": "array" + "anyOf": [ + { + "type": "boolean" + }, + { + "items": { + "type": "boolean" + }, + "type": "array" + } + ], + "default": false, + "title": "Mu Squared" }, "exclude_terms": { - "items": { - "type": "boolean" - }, - "title": "Exclude Terms", - "type": "array" + "anyOf": [ + { + "type": "boolean" + }, + { + "items": { + "type": "boolean" + }, + "type": "array" + } + ], + "default": false, + "title": "Exclude Terms" }, "noisy_edges": { - "items": { - "type": "integer" - }, - "title": "Noisy Edges", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 0, + "description": "How many times you would like to add noise to the given edge values and re-run the algorithm.", + "title": "Noisy Edges" }, "shuffled_prizes": { - "items": { - "type": "integer" - }, - "title": "Shuffled Prizes", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 0, + "description": "shuffled_prizes: How many times the algorithm should shuffle the prizes and re-run", + "title": "Shuffled Prizes" }, "random_terminals": { - "items": { - "type": "integer" - }, - "title": "Random Terminals", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 0, + "description": "How many times to apply the given prizes to random nodes in the interactome", + "title": "Random Terminals" }, "seed": { - "items": { - "type": "integer" - }, - "title": "Seed", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The random seed to use for this run. 
Defaults to the current UNIX timestamp.", + "title": "Seed" }, "w": { - "items": { - "type": "integer" - }, - "title": "W", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "the number of trees", + "title": "W" }, "b": { - "items": { - "type": "number" - }, - "title": "B", - "type": "array" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "the trade-off between including more terminals and using less reliable edges", + "title": "B" }, "d": { - "items": { - "type": "integer" - }, - "title": "D", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "controls the maximum path-length from v0 to terminal nodes", + "title": "D" }, "mu": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Mu", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "controls the degree-based negative prizes (defualt 0.0)", + "title": "Mu" }, "noise": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Noise", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations", + "title": "Noise" }, "g": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "G", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "(Gamma) multiplicative edge penalty from degree of endpoints", + "title": "G" }, "r": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "R", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "msgsteiner parameter that adds random noise to edges, which is rarely needed because the Forest --noisyEdges option is recommended instead (default 0)", + "title": "R" } }, "required": [ - "dummy_mode", - "mu_squared", - "exclude_terms", - "noisy_edges", - "shuffled_prizes", - "random_terminals", - "seed", "w", "b", - "d", - "mu", - "noise", - "g", - "r" + "d" ], "title": "omicsintegrator1RunModel", "type": "object" @@ -794,100 +1019,185 @@ "omicsintegrator2RunModel": { "properties": { "w": { - "items": { - "type": "number" - }, - "title": "W", - "type": "array" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 6, + "description": "Omega: the weight of the edges connecting the dummy node to the nodes selected by dummyMode", + "title": "W" }, "b": { - "items": { - "type": "number" - }, - "title": "B", 
- "type": "array" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 1, + "description": "Beta: scaling factor of prizes", + "title": "B" }, "g": { - "items": { - "type": "number" - }, - "title": "G", - "type": "array" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 20, + "description": "Gamma: multiplicative edge penalty from degree of endpoints", + "title": "G" }, "noise": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Noise", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations.", + "title": "Noise" }, "noisy_edges": { - "items": { - "anyOf": [ - { - "type": "integer" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Noisy Edges", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "An integer specifying how many times to add noise to the given edge values and re-run.", + "title": "Noisy Edges" }, "random_terminals": { - "items": { - "anyOf": [ - { - "type": "integer" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Random Terminals", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "An integer specifying how many times to apply your given prizes to random nodes in the interactome and re-run", + "title": "Random Terminals" }, "dummy_mode": { - "items": { - "anyOf": [ - { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Dummy Mode", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Tells the program which nodes in the interactome to connect the dummy node to. (default: terminals)\n \"terminals\" = connect to all terminals\n \"others\" = connect to all nodes except for terminals\n \"all\" = connect to all nodes in the interactome.", + "title": "Dummy Mode" }, "seed": { - "items": { - "type": "integer" - }, - "title": "Seed", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The random seed to use for this run. 
Defaults to the current UNIX timestamp.", + "title": "Seed" } }, - "required": [ - "w", - "b", - "g", - "noise", - "noisy_edges", - "random_terminals", - "dummy_mode", - "seed" - ], "title": "omicsintegrator2RunModel", "type": "object" }, @@ -921,16 +1231,25 @@ "pathlinkerRunModel": { "properties": { "k": { - "items": { - "type": "integer" - }, - "title": "K", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 100, + "description": "path length", + "title": "K" } }, - "required": [ - "k" - ], "title": "pathlinkerRunModel", "type": "object" }, @@ -964,30 +1283,52 @@ "rwrRunModel": { "properties": { "threshold": { - "items": { - "type": "integer" - }, - "title": "Threshold", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The number of nodes to return", + "title": "Threshold" }, "alpha": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Alpha", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The chance of a restart during the random walk", + "title": "Alpha" } }, "required": [ - "threshold", - "alpha" + "threshold" ], "title": "rwrRunModel", "type": "object" @@ -1022,30 +1363,52 @@ "strwrRunModel": { "properties": { "threshold": { - "items": { - "type": "integer" - }, - "title": "Threshold", - "type": "array" + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The number of nodes to return", + "title": "Threshold" }, "alpha": { - "items": { - "anyOf": [ - { - "type": "number" + "anyOf": [ + { + "type": "number" + }, + { + "items": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ] }, - { - "type": "null" - } - ] - }, - "title": "Alpha", - "type": "array" + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The chance of a restart during the random walk", + "title": "Alpha" } }, "required": [ - "threshold", - "alpha" + "threshold" ], "title": "strwrRunModel", "type": "object" diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index bc7b896fc..bf0f13750 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -3,13 +3,40 @@ parameter combinations. This has been isolated from schema.py as it is not declarative, and rather mainly contains validators and lower-level pydantic code. """ -from typing import Any, cast, Union, Literal +from typing import Annotated, Any, Callable, cast, Union, Literal from spras.runner import algorithms -from pydantic import BaseModel, create_model +from pydantic import BaseModel, BeforeValidator, create_model __all__ = ['AlgorithmUnion'] +def is_numpy_friendly(type: type[Any] | None) -> bool: + """ + Whether the passed in type can have any numpy helpers. + This is mainly used to provide hints in the JSON schema. 
+    """
+    return type in (int, float)
+
+def python_evalish_coerce(type: type[Any] | None) -> Callable[[Any], Any]:
+    """
+    Allows for using numpy and python calls
+    """
+
+    def numpy_coerce_validator(value: Any) -> Any:
+        raise NotImplementedError
+
+    return numpy_coerce_validator
+
+
+def list_coerce(value: Any) -> Any:
+    """
+    Coerces a value to a list if it isn't already.
+    Used as a BeforeValidator.
+    """
+    if not isinstance(value, list):
+        return [value]
+    return value
+
 def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseModel]:
     """
     Dynamically constructs a parameter-combination model based on the original args model.
     This is the most 'hacky' part of this code, but, thanks to pydantic, we avoid reflection
@@ -26,14 +53,32 @@ def construct_algorithm_model(name: str, model: type[BaseMod
     # class AlgorithmParamsCombination(BaseModel):
     #     key1: list[int]
     #     key2: list[list[str]]
-    # This function does not worry about getting the cartesian product of this.
+    # However, we want to preserve certain conveniences (singleton values, fake python evaluation),
+    # so we also make use of BeforeValidators to do so, and we carry their accepted input shapes over into the JSON schema.
+    # (Note: This function does not worry about getting the cartesian product of this.)

-    # Map our fields to a list (assuming we have no nested keys)
-    mapped_list_field: dict[str, type[list[Any]]] = {name: list[field.annotation] for name, field in model.model_fields.items()}
+    # Map our fields to a list (assuming we have no nested keys),
+    # and specify our user convenience validators
+    mapped_list_field: dict[str, Annotated] = {
+        name: (Annotated[
+            list[field.annotation],
+            # This order isn't arbitrary.
+            # https://docs.pydantic.dev/latest/concepts/validators/#ordering-of-validators
+            # This runs second. This coerces any singletons to lists.
+            BeforeValidator(list_coerce),
+            # This runs first. This evaluates numpy utils for integer/float lists
+            BeforeValidator(
+                python_evalish_coerce(field.annotation),
+                # json_schema_input_type (sensibly) overwrites, so we only specify it here.
+                json_schema_input_type=Union[field.annotation, list[field.annotation], str] if is_numpy_friendly(field.annotation) else \
+                    Union[field.annotation, list[field.annotation]]
+            )
+        ], field) for name, field in model.model_fields.items()
+    }

     # Runtime assertion check: mapped_list_field does not contain any `__-prefixed` fields
     for key in mapped_list_field.keys():
-        assert not key.startswith("__"), f"A private key has been passed from {name}'s argument schema." + \
+        assert not key.startswith("__"), f"A private key has been passed from {name}'s argument schema. " + \
            "This should have been caught by the Snakemake CI step."

     # Pass this as kwargs to create_model, which usually takes in parameters field_name=type.

From 72c4cbd4dbbf9cedf607010f659e5811226ab830 Mon Sep 17 00:00:00 2001
From: "Tristan F.-R."
Date: Tue, 15 Jul 2025 15:56:13 +0000 Subject: [PATCH 15/68] feat: default runs for default algorithms --- config/config.yaml | 1 + config/schema.json | 64 +++++++++++++++++++++++++++++--------- spras/config/algorithms.py | 14 ++++++--- spras/runner.py | 26 +++++++++------- 4 files changed, 75 insertions(+), 30 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 3e2127d53..49ae31f4f 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -95,6 +95,7 @@ algorithms: include: true - name: "domino" + include: true params: include: true run1: diff --git a/config/schema.json b/config/schema.json index 01494a4ea..649b815c6 100644 --- a/config/schema.json +++ b/config/schema.json @@ -315,14 +315,16 @@ "additionalProperties": { "$ref": "#/$defs/allpairsRunModel" }, + "default": { + "default": {} + }, "title": "Runs", "type": "object" } }, "required": [ "name", - "include", - "runs" + "include" ], "title": "allpairsModel", "type": "object" @@ -347,14 +349,16 @@ "additionalProperties": { "$ref": "#/$defs/bowtiebuilderRunModel" }, + "default": { + "default": {} + }, "title": "Runs", "type": "object" } }, "required": [ "name", - "include", - "runs" + "include" ], "title": "bowtiebuilderModel", "type": "object" @@ -379,14 +383,20 @@ "additionalProperties": { "$ref": "#/$defs/dominoRunModel" }, + "default": { + "default": { + "_time": 1752594898.608572, + "module_threshold": null, + "slice_threshold": null + } + }, "title": "Runs", "type": "object" } }, "required": [ "name", - "include", - "runs" + "include" ], "title": "dominoModel", "type": "object" @@ -482,14 +492,20 @@ "additionalProperties": { "$ref": "#/$defs/meoRunModel" }, + "default": { + "default": { + "max_path_length": null, + "local_search": null, + "rand_restarts": null + } + }, "title": "Runs", "type": "object" } }, "required": [ "name", - "include", - "runs" + "include" ], "title": "meoModel", "type": "object" @@ -593,14 +609,19 @@ "additionalProperties": { "$ref": "#/$defs/mincostflowRunModel" }, + "default": { + "default": { + "flow": null, + "capacity": null + } + }, "title": "Runs", "type": "object" } }, "required": [ "name", - "include", - "runs" + "include" ], "title": "mincostflowModel", "type": "object" @@ -1004,14 +1025,25 @@ "additionalProperties": { "$ref": "#/$defs/omicsintegrator2RunModel" }, + "default": { + "default": { + "w": 6.0, + "b": 1.0, + "g": 20.0, + "noise": null, + "noisy_edges": null, + "random_terminals": null, + "dummy_mode": null, + "seed": 1752594898608 + } + }, "title": "Runs", "type": "object" } }, "required": [ "name", - "include", - "runs" + "include" ], "title": "omicsintegrator2Model", "type": "object" @@ -1216,14 +1248,18 @@ "additionalProperties": { "$ref": "#/$defs/pathlinkerRunModel" }, + "default": { + "default": { + "k": 100 + } + }, "title": "Runs", "type": "object" } }, "required": [ "name", - "include", - "runs" + "include" ], "title": "pathlinkerModel", "type": "object" diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index bf0f13750..f129594db 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -3,7 +3,7 @@ parameter combinations. This has been isolated from schema.py as it is not declarative, and rather mainly contains validators and lower-level pydantic code. 
""" -from typing import Annotated, Any, Callable, cast, Union, Literal +from typing import Annotated, Any, Callable, cast, Optional, Union, Literal from spras.runner import algorithms from pydantic import BaseModel, BeforeValidator, create_model @@ -37,7 +37,7 @@ def list_coerce(value: Any) -> Any: return [value] return value -def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseModel]: +def construct_algorithm_model(name: str, model: type[BaseModel], model_default: Optional[BaseModel]) -> type[BaseModel]: """ Dynamically constructs a parameter-combination model based on the original args model. This is the most 'hacky' part of this code, but, thanks to pydantic, we avoid reflection @@ -100,8 +100,14 @@ def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseMod f'{name}Model', name=Literal[name], include=bool, - runs=dict[str, run_model] + # For algorithms that have a default parameter config, we allow arbitrarily running an algorithm + # if no runs are specified. For example, the following config + # name: pathlinker + # include: true + # will run, despite there being no entries in `runs`. + # (create_model entries take in either a type or (type, default)). + runs=dict[str, run_model] if model_default is None else (dict[str, run_model], {"default": model_default}) ) -algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model) for name, (_, model) in algorithms.items()] +algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model, model_default) for name, (_, model, model_default) in algorithms.items()] AlgorithmUnion = Union[tuple(algorithm_models)] diff --git a/spras/runner.py b/spras/runner.py index 843b3cf46..4f603f9b9 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Optional from pydantic import BaseModel @@ -17,17 +17,19 @@ from spras.rwr import RWR, RWRParams from spras.strwr import ST_RWR, ST_RWRParams -algorithms: dict[str, tuple[type[PRM], type[BaseModel]]] = { - "allpairs": (AllPairs, Empty), - "bowtiebuilder": (BowTieBuilder, Empty), - "domino": (DOMINO, DominoParams), - "meo": (MEO, MEOParams), - "mincostflow": (MinCostFlow, MinCostFlowParams), - "omicsintegrator1": (OmicsIntegrator1, OmicsIntegrator1Params), - "omicsintegrator2": (OmicsIntegrator2, OmicsIntegrator2Params), - "pathlinker": (PathLinker, PathLinkerParams), - "rwr": (RWR, RWRParams), - "strwr": (ST_RWR, ST_RWRParams), +# Algorithm names to a three-tuple of (PRM, BaseModel, default BaseModel or None if there are no good defaults). +# This is used for the configuration and to fetch algorithms during reconstruction +algorithms: dict[str, tuple[type[PRM], type[BaseModel], Optional[BaseModel]]] = { + "allpairs": (AllPairs, Empty, Empty()), + "bowtiebuilder": (BowTieBuilder, Empty, Empty()), + "domino": (DOMINO, DominoParams, DominoParams()), + "meo": (MEO, MEOParams, MEOParams()), + "mincostflow": (MinCostFlow, MinCostFlowParams, MinCostFlowParams()), + "omicsintegrator1": (OmicsIntegrator1, OmicsIntegrator1Params, None), + "omicsintegrator2": (OmicsIntegrator2, OmicsIntegrator2Params, OmicsIntegrator2Params()), + "pathlinker": (PathLinker, PathLinkerParams, PathLinkerParams()), + "rwr": (RWR, RWRParams, None), + "strwr": (ST_RWR, ST_RWRParams, None), } def get_algorithm(algorithm: str) -> type[PRM]: From 2ef26727221583cc7ef6c613e651d15a25c6b0e8 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Tue, 15 Jul 2025 17:11:33 +0000
Subject: [PATCH 16/68] feat: function running

---
 config/schema.json         |  4 +--
 spras/config/algorithms.py | 55 ++++++++++++++++++++++++++++--------
 spras/omicsintegrator1.py  |  3 +--
 util/play.py               |  5 ++++
 4 files changed, 52 insertions(+), 15 deletions(-)
 create mode 100644 util/play.py

diff --git a/config/schema.json b/config/schema.json
index 649b815c6..be41b5b3d 100644
--- a/config/schema.json
+++ b/config/schema.json
@@ -385,7 +385,7 @@
         },
         "default": {
           "default": {
-            "_time": 1752594898.608572,
+            "_time": 1752596079.9888437,
             "module_threshold": null,
             "slice_threshold": null
           }
@@ -1034,7 +1034,7 @@
             "noisy_edges": null,
             "random_terminals": null,
             "dummy_mode": null,
-            "seed": 1752594898608
+            "seed": 1752596079988
           }
         },
         "title": "Runs",
diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py
index f129594db..b5c42199d 100644
--- a/spras/config/algorithms.py
+++ b/spras/config/algorithms.py
@@ -3,8 +3,10 @@
 parameter combinations. This has been isolated from schema.py as it is not declarative,
 and rather mainly contains validators and lower-level pydantic code.
 """
+import ast
 from typing import Annotated, Any, Callable, cast, Optional, Union, Literal

+import numpy as np
 from spras.runner import algorithms
 from pydantic import BaseModel, BeforeValidator, create_model

@@ -17,16 +19,48 @@ def is_numpy_friendly(type: type[Any] | None) -> bool:
     """
     return type in (int, float)

-def python_evalish_coerce(type: type[Any] | None) -> Callable[[Any], Any]:
+def python_evalish_coerce(value: Any) -> Any:
     """
-    Allows for using numpy and python calls
+    Allows for using numpy and python calls.
+
+    **Safety Note**: This does not prevent availability attacks: this can still exhaust
+    resources if abused. It only prevents secret leakage.
     """
+
+    if not isinstance(value, str):
+        return value
+
+    # These strings are in the form of function calls `function.name(param1, param2, ...)`.
+    # Since we want to avoid `eval` (this might be running in the secret-sensitive HTCondor),
+    # we need to parse these functions.
+    functions_dict: dict[str, Callable[[list[Any]], list[Union[int, float]]]] = {
+        'range': lambda params: list(range(*params)),
+        "np.linspace": lambda params: list(np.linspace(*params)),
+        "np.arange": lambda params: list(np.arange(*params)),
+        "np.logspace": lambda params: list(np.logspace(*params)),
+    }
+
+    # To do this, we get the AST of our string as an expression
+    value_ast = ast.parse(value, mode='eval')
+
+    # Then we do some light parsing - we're only looking to do some literal evaluation
+    # (e.g. allowing 1+1) and some basic function parsing. Full python programs
+    # should just generate a config.yaml.
+
+    # This should always be an Expression whose body is Call (a function).
+    if not isinstance(value_ast.body, ast.Call):
+        raise ValueError(f'The python code "{value}" should be calling a function directly. Is this meant to be python code?')

-    def numpy_coerce_validator(value: Any) -> Any:
-        raise NotImplementedError
+    # We get the function name back as a string
+    function_name = ast.unparse(value_ast.body.func)

-    return numpy_coerce_validator
+    # and we use `ast.literal_eval`, which is safe apart from availability attacks, to support light expressions.
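+    # For example, the config string "range(100,201,100)" parses to a Call whose
+    # func unparses to "range" and whose args are the literal nodes 100, 201, and 100;
+    # ast.literal_eval below turns each node back into a plain value before dispatch.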
+ arguments = [ast.literal_eval(arg) for arg in value_ast.body.args] + if function_name not in functions_dict: + raise ValueError(f"{function_name} is not an allowed function to be run!") + + return functions_dict[function_name](arguments) def list_coerce(value: Any) -> Any: """ @@ -65,14 +99,13 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: # This order isn't arbitrary. # https://docs.pydantic.dev/latest/concepts/validators/#ordering-of-validators # This runs second. This coerces any singletons to lists. - BeforeValidator(list_coerce), + BeforeValidator(list_coerce, json_schema_input_type=Union[field.annotation, list[field.annotation]]), # This runs first. This evaluates numpy utils for integer/float lists BeforeValidator( - python_evalish_coerce(field.annotation), - # json_schema_input_type (sensibly) overwrites, so we only specify it here. - json_schema_input_type=Union[field.annotation, list[field.annotation], str] if is_numpy_friendly(field.annotation) else \ - Union[field.annotation, list[field.annotation]] - ) + python_evalish_coerce, + # json_schema_input_type (sensibly) overwrites, so we have to specify the entire union again here. + json_schema_input_type=Union[field.annotation, list[field.annotation], str] + ) if is_numpy_friendly(field.annotation) else None ], field) for name, field in model.model_fields.items() } diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index ddb934bb5..013eced8d 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -209,8 +209,7 @@ def run(inputs, output_file, args, container_framework="docker"): command.extend(['--noisyEdges', str(args.noisy_edges)]) command.extend(['--shuffledPrizes', str(args.shuffled_prizes)]) command.extend(['--randomTerminals', str(args.random_terminals)]) - if args.seed is not None: - command.extend(['--seed', str(args.seed)]) + command.extend(['--seed', str(args.seed)]) container_suffix = "omics-integrator-1:no-conda" # no-conda version is the default run_container_and_log('Omics Integrator 1', diff --git a/util/play.py b/util/play.py new file mode 100644 index 000000000..f53ae9f53 --- /dev/null +++ b/util/play.py @@ -0,0 +1,5 @@ +import ast +value_ast = ast.parse("np.range.test(1, 2, 3)", mode='eval') +# print(ast.dump(value_ast.body, indent=2)) +assert isinstance(value_ast.body, ast.Call) +print([ast.literal_eval(arg) for arg in value_ast.body.args]) \ No newline at end of file From 9442b6496251823456670092f12b404dba19c76c Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 15 Jul 2025 17:13:45 +0000 Subject: [PATCH 17/68] chore: drop play --- util/play.py | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 util/play.py diff --git a/util/play.py b/util/play.py deleted file mode 100644 index f53ae9f53..000000000 --- a/util/play.py +++ /dev/null @@ -1,5 +0,0 @@ -import ast -value_ast = ast.parse("np.range.test(1, 2, 3)", mode='eval') -# print(ast.dump(value_ast.body, indent=2)) -assert isinstance(value_ast.body, ast.Call) -print([ast.literal_eval(arg) for arg in value_ast.body.args]) \ No newline at end of file From 60b562f45a89a61d863a4bc6422cf87ce9fe81b7 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Tue, 15 Jul 2025 17:21:36 +0000 Subject: [PATCH 18/68] fix(config): don't try to parse in config.py --- spras/config/config.py | 42 +++++++----------------------------------- 1 file changed, 7 insertions(+), 35 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 252d6ccf5..72f08f330 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -164,47 +164,19 @@ def process_algorithms(self, raw_config: RawConfig): # Do not parse the rest of the parameters for this algorithm if it is not included continue - if cur_params.directed is not None: - warnings.warn("UPDATE: we no longer use the directed key in the config file", stacklevel=2) - - cur_params = cur_params.__pydantic_extra__ - if cur_params is None: - raise RuntimeError("An internal error occured: ConfigDict extra should be set on AlgorithmParams.") - - # The algorithm has no named arguments so create a default placeholder - if len(cur_params.keys()) == 0: - cur_params["run1"] = {"spras_placeholder": ["no parameters"]} + runs: dict[str, Any] = cur_params.runs # Each set of runs should be 1 level down in the config file - for run_params in cur_params: + for run_name in runs.keys(): all_runs = [] # We create the product of all param combinations for each run param_name_list = [] - if cur_params[run_params]: - for p in cur_params[run_params]: - param_name_list.append(p) - obj = str(cur_params[run_params][p]) - try: - obj = [int(obj)] - except ValueError: - try: - obj = [float(obj)] - except ValueError: - # Handles arrays and special evaluation types - # TODO: do we want to explicitly bar `eval` if we may use untrusted user inputs later? - if obj.startswith(("range", "np.linspace", "np.arange", "np.logspace", "[")): - obj = eval(obj) - elif obj.lower() == "true": - obj = [True] - elif obj.lower() == "false": - obj = [False] - else: - # Catch-all for strings - obj = [obj] - if not isinstance(obj, Iterable): - raise ValueError(f"The object `{obj}` in algorithm {alg.name} at key '{p}' in run '{run_params}' is not iterable!") from None - all_runs.append(obj) + for param in runs[run_name]: + param_name_list.append(param) + # this is guaranteed to be list[Any] by algorithms.py + param_values: list[Any] = runs[run_name][param] + all_runs.append(param_values) run_list_tuples = list(it.product(*all_runs)) param_name_tuple = tuple(param_name_list) for r in run_list_tuples: From c1947e67409b90cd07a2f632335302f8f6422554 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 15 Jul 2025 18:32:30 +0000 Subject: [PATCH 19/68] fix: subscriptability --- spras/config/algorithms.py | 17 ++++-- spras/config/config.py | 10 ++-- spras/config/schema.py | 3 - spras/omicsintegrator2.py | 8 ++- test/test_config.py | 109 ++++++++++++++++--------------------- 5 files changed, 70 insertions(+), 77 deletions(-) diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index b5c42199d..fbc7a2230 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -4,11 +4,11 @@ and rather mainly contains validators and lower-level pydantic code. 
""" import ast -from typing import Annotated, Any, Callable, cast, Optional, Union, Literal +from typing import Annotated, Any, Callable, cast, get_args, Optional, Union, Literal import numpy as np from spras.runner import algorithms -from pydantic import BaseModel, BeforeValidator, create_model +from pydantic import BaseModel, BeforeValidator, create_model, Field __all__ = ['AlgorithmUnion'] @@ -17,7 +17,11 @@ def is_numpy_friendly(type: type[Any] | None) -> bool: Whether the passed in type can have any numpy helpers. This is mainly used to provide hints in the JSON schema. """ - return type in (int, float) + allowed_types = (int, float) + + # check basic types, then check optional types + return type in allowed_types or \ + any([arg for arg in get_args(type) if arg in allowed_types]) def python_evalish_coerce(value: Any) -> Any: """ @@ -41,10 +45,10 @@ def python_evalish_coerce(value: Any) -> Any: } # To do this, we get the AST of our string as an expression - value_ast = ast.parse(value, mode='eval') + value_ast = ast.parse(value, mode='eval', filename='config.yaml') # Then we do some light parsing - we're only looking to do some literal evaluation - # (e.g. allowing 1+1) and some basic function parsing. Full python programs + # (allowing light python notation) and some basic function parsing. Full python programs # should just generate a config.yaml. # This should always be an Expression whose body is Call (a function). @@ -143,4 +147,5 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: ) algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model, model_default) for name, (_, model, model_default) in algorithms.items()] -AlgorithmUnion = Union[tuple(algorithm_models)] +# name differentriates algorithms +AlgorithmUnion = Annotated[Union[tuple(algorithm_models)], Field(discriminator='name')] diff --git a/spras/config/config.py b/spras/config/config.py index 72f08f330..6eeb760a7 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -156,15 +156,14 @@ def process_algorithms(self, raw_config: RawConfig): self.algorithm_directed = dict() self.algorithms = raw_config.algorithms for alg in self.algorithms: - cur_params = alg.params - if cur_params.include: + if alg.include: # This dict maps from parameter combinations hashes to parameter combination dictionaries self.algorithm_params[alg.name] = dict() else: # Do not parse the rest of the parameters for this algorithm if it is not included continue - runs: dict[str, Any] = cur_params.runs + runs: dict[str, Any] = alg.runs # Each set of runs should be 1 level down in the config file for run_name in runs.keys(): @@ -172,10 +171,11 @@ def process_algorithms(self, raw_config: RawConfig): # We create the product of all param combinations for each run param_name_list = [] - for param in runs[run_name]: + run_subscriptable = vars(runs[run_name]) + for param in run_subscriptable: param_name_list.append(param) # this is guaranteed to be list[Any] by algorithms.py - param_values: list[Any] = runs[run_name][param] + param_values: list[Any] = run_subscriptable[param] all_runs.append(param_values) run_list_tuples = list(it.product(*all_runs)) param_name_tuple = tuple(param_name_list) diff --git a/spras/config/schema.py b/spras/config/schema.py index 7657a41a0..fc502b677 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -131,6 +131,3 @@ class RawConfig(BaseModel): reconstruction_settings: ReconstructionSettings model_config = ConfigDict(extra='forbid') - -# AlgorithmUnion 
is dynamically constructed. -RawConfig.model_rebuild() diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 5e8e73ef0..d92ba77d2 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -5,6 +5,7 @@ import pandas as pd from pydantic import BaseModel, ConfigDict, Field +from spras.config.util import CaseInsensitiveEnum from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import reinsert_direction_col_undirected @@ -13,6 +14,11 @@ __all__ = ['OmicsIntegrator2', 'OmicsIntegrator2Params'] +class DummyMode(CaseInsensitiveEnum): + terminals = 'terminals' + others = 'others' + all = 'all' + class OmicsIntegrator2Params(BaseModel): w: float = 6 "Omega: the weight of the edges connecting the dummy node to the nodes selected by dummyMode" @@ -32,7 +38,7 @@ class OmicsIntegrator2Params(BaseModel): random_terminals: Optional[int] = None "An integer specifying how many times to apply your given prizes to random nodes in the interactome and re-run" - dummy_mode: Optional[str] = None + dummy_mode: Optional[DummyMode] = None """ Tells the program which nodes in the interactome to connect the dummy node to. (default: terminals) "terminals" = connect to all terminals diff --git a/test/test_config.py b/test/test_config.py index 6095ad145..b0031d029 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -5,6 +5,7 @@ import spras.config.config as config from spras.config.schema import DEFAULT_HASH_LENGTH +from spras.config.container_schema import DEFAULT_CONTAINER_PREFIX filler_dataset_data: dict[str, str | list[str]] = { "data_dir": "fake", @@ -18,10 +19,12 @@ # individual values of the dict can be changed and the whole initialization can be re-run. def get_test_config(): test_raw_config = { - "container_framework": "singularity", - "container_registry": { - "base_url": "docker.io", - "owner": "reedcompbio", + "containers": { + "framework": "singularity", + "registry": { + "base_url": "docker.io", + "owner": "reedcompbio", + }, }, "hash_length": 7, "reconstruction_settings": { @@ -49,55 +52,37 @@ def get_test_config(): "data_dir": "gs-fake" }], "algorithms": [ + # Since there is algorithm validation, + # we are (mostly) forced to use real algorithm parameters here. + # To make this more readable, we make the 'test names' the run names. + # TODO: we don't have a test for combinations of strings anymore. This seems to be fine, + # but it would be nice to have once we introduce an algorithm that takes more than 1 string parameter. 
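+            # As a concrete reference for the coercion exercised below: the string
+            # values "np.linspace(0, 5,2)" and "range(1, 3)" expand to [0.0, 5.0]
+            # and [1, 2] respectively under the new validators.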
{ - "name": "strings", - "params": { - "include": True, - "run1": {"test": "str1", "test2": ["str2", "str3"]} - } - }, - { - "name": "numbersAndBools", - "params": { - "include": True, - "run1": {"a": 1, "b": [float(2.0), 3], "c": [4], "d": float(5.6), "f": False} - } - }, - { - "name": "singleton_int64_with_array", - "params": { - "include": True, - "run1": {"test": np.int64(1), "test2": [2, 3]} + "name": "omicsintegrator2", + "include": True, + "runs": { + "strings": {"dummyMode": ["terminals", "others"], "b": 1}, + # spacing in np.linspace is on purpose + "singleton_string_np_linspace": {"dummyMode": "terminals", "b": "np.linspace(0, 5,2)"}, + "str_array_np_logspace": {"test": ["others", "all"], "g": "np.logspace(1,1)"} } }, { - "name": "singleton_string_np_linspace", - "params": { - "include": True, - "run1": {"test": "str1", "test2": "np.linspace(0,5,2)"} + "name": "meo", + "include": True, + "runs": { + "numbersAndBool": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": True}, + "numbersAndBools": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": [True, False]}, + "boolArrTest": {"local_search": [True, False], "max_path_length": "range(1, 3)"} } }, { - "name": "str_array_np_logspace", - "params": { - "include": True, - "run1": {"test": ["a", "b"], "test2": "np.logspace(1,1)"} + "name": "mincostflow", + "include": True, + "runs": { + "int64artifact": {"flow": "np.arange(5,6)", "capacity": [2, 3]} } }, - { - "name": "int64artifact", - "params": { - "include": True, - "run1": {"test": "np.arange(5,6)", "test2": [2, 3]} - } - }, - { - "name": "boolArrTest", - "params": { - "include": True, - "run1": {"flags": [True, False], "range": "range(1, 3)"} - } - } ], "analysis": { "summary": { @@ -159,46 +144,46 @@ def test_config_container_framework_normalization(self): # Test singularity test_config = get_test_config() - test_config["container_framework"] = "singularity" + test_config["containers"]["framework"] = "singularity" config.init_global(test_config) - assert (config.config.container_framework == "singularity") + assert (config.config.container_settings.framework == "singularity") # Test singularity with capitalization - test_config["container_framework"] = "Singularity" + test_config["containers"]["framework"] = "Singularity" config.init_global(test_config) - assert (config.config.container_framework == "singularity") + assert (config.config.container_settings.framework == "singularity") # Test docker - test_config["container_framework"] = "docker" + test_config["containers"]["framework"] = "docker" config.init_global(test_config) - assert (config.config.container_framework == "docker") + assert (config.config.container_settings.framework == "docker") # Test docker with capitalization - test_config["container_framework"] = "Docker" + test_config["containers"]["framework"] = "Docker" config.init_global(test_config) - assert (config.config.container_framework == "docker") + assert (config.config.container_settings.framework == "docker") # Test unknown framework - test_config["container_framework"] = "badFramework" + test_config["containers"]["framework"] = "badFramework" with pytest.raises(ValueError): config.init_global(test_config) def test_config_container_registry(self): test_config = get_test_config() - test_config["container_registry"]["base_url"] = "docker.io" - test_config["container_registry"]["owner"] = "reedcompbio" + test_config["containers"]["registry"]["base_url"] = "docker.io" + test_config["containers"]["registry"]["owner"] = 
"reedcompbio" config.init_global(test_config) - assert (config.config.container_prefix == "docker.io/reedcompbio") + assert (config.config.container_settings.prefix == "docker.io/reedcompbio") - test_config["container_registry"]["base_url"] = "another.repo" - test_config["container_registry"]["owner"] = "different-owner" + test_config["containers"]["registry"]["base_url"] = "another.repo" + test_config["containers"]["registry"]["owner"] = "different-owner" config.init_global(test_config) - assert (config.config.container_prefix == "another.repo/different-owner") + assert (config.config.container_settings.prefix == "another.repo/different-owner") - test_config["container_registry"]["base_url"] = "" - test_config["container_registry"]["owner"] = "" + test_config["containers"]["registry"]["base_url"] = "" + test_config["containers"]["registry"]["owner"] = "" config.init_global(test_config) - assert (config.config.container_prefix == config.DEFAULT_CONTAINER_PREFIX) + assert (config.config.container_settings.prefix == DEFAULT_CONTAINER_PREFIX) def test_error_dataset_label(self): test_config = get_test_config() From 8beaf72a7e7f64c42cc543a0f31b77fdf99485e3 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 15 Jul 2025 20:03:48 +0000 Subject: [PATCH 20/68] fix: auto-discriminator mapping & forbid --- config/egfr.yaml | 71 ++++++++------------ config/schema.json | 107 ++++++++++++++++++++++++++++--- spras/config/algorithms.py | 6 +- spras/config/container_schema.py | 11 +++- spras/domino.py | 2 +- spras/meo.py | 2 +- spras/mincostflow.py | 2 +- spras/omicsintegrator1.py | 2 +- spras/omicsintegrator2.py | 2 +- spras/pathlinker.py | 2 +- spras/rwr.py | 2 +- spras/strwr.py | 2 +- 12 files changed, 143 insertions(+), 68 deletions(-) diff --git a/config/egfr.yaml b/config/egfr.yaml index 9b4ccc45b..106963c62 100644 --- a/config/egfr.yaml +++ b/config/egfr.yaml @@ -1,41 +1,25 @@ -# The length of the hash used to identify a parameter combination -hash_length: 7 - -# Specify the container framework used by each PRM wrapper. Valid options include: -# - docker (default if not specified) -# - singularity -- Also known as apptainer, useful in HPC/HTC environments where docker isn't allowed -# - dsub -- experimental with limited support, used for running on Google Cloud -container_framework: docker +# yaml-language-server: $schema=./schema.json -# Only used if container_framework is set to singularity, this will unpack the singularity containers -# to the local filesystem. This is useful when PRM containers need to run inside another container, -# such as would be the case in an HTCondor/OSPool environment. -# NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way -# that persists after the workflow is complete. To clean up the unpacked containers, the user must -# manually delete them. 
-unpack_singularity: false - -# Allow the user to configure which container registry containers should be pulled from -# Note that this assumes container names are consistent across registries, and that the -# registry being passed doesn't require authentication for pull actions -container_registry: - base_url: docker.io - # The owner or project of the registry - # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs - owner: reedcompbio +hash_length: 7 +containers: + framework: docker + unpack_singularity: false + registry: + base_url: docker.io + owner: reedcompbio algorithms: - name: pathlinker - params: - include: true + include: true + runs: run1: k: - 10 - 20 - 70 - name: omicsintegrator1 - params: - include: true + include: true + runs: run1: b: - 0.55 @@ -53,8 +37,8 @@ algorithms: - 0.008 dummy_mode: ["file"] - name: omicsintegrator2 - params: - include: true + include: true + runs: run1: b: - 4 @@ -66,36 +50,31 @@ algorithms: g: - 3 - name: meo - params: - include: true + include: true + runs: run1: - local_search: - - "Yes" + local_search: true max_path_length: - 3 rand_restarts: - 10 run2: - local_search: - - "No" - max_path_length: - - 2 - rand_restarts: - - 10 + local_search: false + max_path_length: 2 + rand_restarts: 10 - name: allpairs - params: - include: true + include: true - name: domino - params: - include: true + include: true + runs: run1: slice_threshold: - 0.3 module_threshold: - 0.05 - name: mincostflow - params: - include: true + include: true + runs: run1: capacity: - 15 diff --git a/config/schema.json b/config/schema.json index be41b5b3d..c15dcaa8a 100644 --- a/config/schema.json +++ b/config/schema.json @@ -50,23 +50,23 @@ "additionalProperties": false, "properties": { "base_url": { + "default": "docker.io", + "description": "The domain of the registry", "title": "Base Url", "type": "string" }, "owner": { + "default": "reedcompbio", "description": "The owner or project of the registry", "title": "Owner", "type": "string" } }, - "required": [ - "base_url", - "owner" - ], "title": "ContainerRegistry", "type": "object" }, "ContainerSettings": { + "additionalProperties": false, "properties": { "framework": { "$ref": "#/$defs/ContainerFramework", @@ -149,6 +149,15 @@ "title": "Dataset", "type": "object" }, + "DummyMode": { + "enum": [ + "terminals", + "others", + "all" + ], + "title": "DummyMode", + "type": "string" + }, "EvaluationAnalysis": { "additionalProperties": false, "properties": { @@ -301,6 +310,7 @@ "type": "object" }, "allpairsModel": { + "additionalProperties": false, "properties": { "name": { "const": "allpairs", @@ -330,11 +340,13 @@ "type": "object" }, "allpairsRunModel": { + "additionalProperties": false, "properties": {}, "title": "allpairsRunModel", "type": "object" }, "bowtiebuilderModel": { + "additionalProperties": false, "properties": { "name": { "const": "bowtiebuilder", @@ -364,11 +376,13 @@ "type": "object" }, "bowtiebuilderRunModel": { + "additionalProperties": false, "properties": {}, "title": "bowtiebuilderRunModel", "type": "object" }, "dominoModel": { + "additionalProperties": false, "properties": { "name": { "const": "domino", @@ -385,7 +399,7 @@ }, "default": { "default": { - "_time": 1752596079.9888437, + "_time": 1752606304.38952, "module_threshold": null, "slice_threshold": null } @@ -402,6 +416,7 @@ "type": "object" }, "dominoRunModel": { + "additionalProperties": false, "properties": { "_time": { "anyOf": [ @@ -439,6 +454,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } 
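The recurring + { "type": "string" } additions in this hunk and the hunks below share one purpose: every list-coercible numeric field may now also be written as a string such as "np.linspace(0, 5, 2)" or "range(1, 3)", mirroring the json_schema_input_type=Union[field.annotation, list[field.annotation], str] annotation that algorithms.py attaches to numpy-friendly fields.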
@@ -465,6 +483,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -478,6 +499,7 @@ "type": "object" }, "meoModel": { + "additionalProperties": false, "properties": { "name": { "const": "meo", @@ -511,6 +533,7 @@ "type": "object" }, "meoRunModel": { + "additionalProperties": false, "properties": { "max_path_length": { "anyOf": [ @@ -530,6 +553,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -582,6 +608,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -595,6 +624,7 @@ "type": "object" }, "mincostflowModel": { + "additionalProperties": false, "properties": { "name": { "const": "mincostflow", @@ -627,6 +657,7 @@ "type": "object" }, "mincostflowRunModel": { + "additionalProperties": false, "properties": { "flow": { "anyOf": [ @@ -646,6 +677,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -672,6 +706,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -685,6 +722,7 @@ "type": "object" }, "omicsintegrator1Model": { + "additionalProperties": false, "properties": { "name": { "const": "omicsintegrator1", @@ -712,6 +750,7 @@ "type": "object" }, "omicsintegrator1RunModel": { + "additionalProperties": false, "properties": { "dummy_mode": { "anyOf": [ @@ -915,6 +954,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -941,6 +983,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -967,6 +1012,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -993,6 +1041,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -1011,6 +1062,7 @@ "type": "object" }, "omicsintegrator2Model": { + "additionalProperties": false, "properties": { "name": { "const": "omicsintegrator2", @@ -1034,7 +1086,7 @@ "noisy_edges": null, "random_terminals": null, "dummy_mode": null, - "seed": 1752596079988 + "seed": 1752606304389 } }, "title": "Runs", @@ -1049,6 +1101,7 @@ "type": "object" }, "omicsintegrator2RunModel": { + "additionalProperties": false, "properties": { "w": { "anyOf": [ @@ -1125,6 +1178,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -1151,6 +1207,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -1177,6 +1236,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -1188,13 +1250,13 @@ "dummy_mode": { "anyOf": [ { - "type": "string" + "$ref": "#/$defs/DummyMode" }, { "items": { "anyOf": [ { - "type": "string" + "$ref": "#/$defs/DummyMode" }, { "type": "null" @@ -1234,6 +1296,7 @@ "type": "object" }, "pathlinkerModel": { + "additionalProperties": false, "properties": { "name": { "const": "pathlinker", @@ -1265,6 +1328,7 @@ "type": "object" }, "pathlinkerRunModel": { + "additionalProperties": false, "properties": { "k": { "anyOf": [ @@ -1290,6 +1354,7 @@ "type": "object" }, "rwrModel": { + "additionalProperties": false, "properties": { "name": { "const": "rwr", @@ -1317,6 +1382,7 @@ "type": "object" }, "rwrRunModel": { + "additionalProperties": false, "properties": { "threshold": { "anyOf": [ @@ -1354,6 +1420,9 @@ }, "type": "array" }, + { + "type": "string" + }, { "type": "null" } @@ -1370,6 +1439,7 @@ "type": "object" }, "strwrModel": { + "additionalProperties": false, "properties": { "name": { "const": "strwr", @@ -1397,6 +1467,7 @@ "type": "object" }, "strwrRunModel": { + "additionalProperties": false, "properties": { "threshold": { "anyOf": [ @@ -1434,6 +1505,9 @@ }, "type": "array" }, + { + "type": "string" + 
}, { "type": "null" } @@ -1463,7 +1537,22 @@ }, "algorithms": { "items": { - "anyOf": [ + "discriminator": { + "mapping": { + "allpairs": "#/$defs/allpairsModel", + "bowtiebuilder": "#/$defs/bowtiebuilderModel", + "domino": "#/$defs/dominoModel", + "meo": "#/$defs/meoModel", + "mincostflow": "#/$defs/mincostflowModel", + "omicsintegrator1": "#/$defs/omicsintegrator1Model", + "omicsintegrator2": "#/$defs/omicsintegrator2Model", + "pathlinker": "#/$defs/pathlinkerModel", + "rwr": "#/$defs/rwrModel", + "strwr": "#/$defs/strwrModel" + }, + "propertyName": "name" + }, + "oneOf": [ { "$ref": "#/$defs/allpairsModel" }, diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index fbc7a2230..32f6b82d3 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -8,7 +8,7 @@ import numpy as np from spras.runner import algorithms -from pydantic import BaseModel, BeforeValidator, create_model, Field +from pydantic import BaseModel, BeforeValidator, create_model, ConfigDict, Field __all__ = ['AlgorithmUnion'] @@ -123,6 +123,7 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: # have had a key that starts with __ in mapped_list_fields. The above assertion prevents this. run_model = (cast(Any, create_model))( f'{name}RunModel', + __config__=ConfigDict(extra='forbid'), **mapped_list_field ) @@ -143,7 +144,8 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: # include: true # will run, despite there being no entries in `runs`. # (create_model entries take in either a type or (type, default)). - runs=dict[str, run_model] if model_default is None else (dict[str, run_model], {"default": model_default}) + runs=dict[str, run_model] if model_default is None else (dict[str, run_model], {"default": model_default}), + __config__=ConfigDict(extra='forbid') ) algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model, model_default) for name, (_, model, model_default) in algorithms.items()] diff --git a/spras/config/container_schema.py b/spras/config/container_schema.py index 9688a9b51..ea9881a30 100644 --- a/spras/config/container_schema.py +++ b/spras/config/container_schema.py @@ -22,10 +22,13 @@ class ContainerFramework(CaseInsensitiveEnum): dsub = 'dsub' class ContainerRegistry(BaseModel): - base_url: str - owner: str = Field(description="The owner or project of the registry") + base_url: str = "docker.io" + "The domain of the registry" - model_config = ConfigDict(extra='forbid') + owner: str = "reedcompbio" + "The owner or project of the registry" + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) class ContainerSettings(BaseModel): framework: ContainerFramework = ContainerFramework.docker @@ -33,6 +36,8 @@ class ContainerSettings(BaseModel): registry: ContainerRegistry hash_length: int = 7 + model_config = ConfigDict(extra='forbid') + @dataclass class ProcessedContainerSettings: framework: ContainerFramework = ContainerFramework.docker diff --git a/spras/domino.py b/spras/domino.py index a9ce7a43b..521f89722 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -26,7 +26,7 @@ class DominoParams(NondeterministicModel): slice_threshold: Optional[float] = None "the p-value threshold for considering a putative module as final module (optional)" - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ DOMINO will construct a fully undirected graph from the provided input file diff --git 
a/spras/meo.py b/spras/meo.py index 02edf07af..4b3f9299e 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -83,7 +83,7 @@ class MEOParams(BaseModel): rand_restarts: Optional[int] = None "The number of random restarts to do." - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ MEO can support partially directed graphs diff --git a/spras/mincostflow.py b/spras/mincostflow.py index 2673d91e2..1f7ff0cf7 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -20,7 +20,7 @@ class MinCostFlowParams(BaseModel): capacity: Optional[float] = None "amount of capacity allowed on each edge" - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ MinCostFlow deals with fully directed graphs diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 013eced8d..1f33c25f7 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -77,7 +77,7 @@ class OmicsIntegrator1Params(BaseModel): r: Optional[float] = None "msgsteiner parameter that adds random noise to edges, which is rarely needed because the Forest --noisyEdges option is recommended instead (default 0)" - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) class OmicsIntegrator1(PRM[OmicsIntegrator1Params]): """ diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index d92ba77d2..aef4f3c48 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -49,7 +49,7 @@ class OmicsIntegrator2Params(BaseModel): seed: int = Field(default_factory=lambda _: int(time.time() * 1000)) "The random seed to use for this run. Defaults to the current UNIX timestamp." - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ Omics Integrator 2 will construct a fully undirected graph from the provided input file diff --git a/spras/pathlinker.py b/spras/pathlinker.py index 9b6fe964c..da0a91ba2 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -19,7 +19,7 @@ class PathLinkerParams(BaseModel): k: int = 100 "path length" - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ Pathlinker will construct a fully directed graph from the provided input file diff --git a/spras/rwr.py b/spras/rwr.py index ba78589ec..dff5bdb97 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -19,7 +19,7 @@ class RWRParams(BaseModel): alpha: Optional[float] = None "The chance of a restart during the random walk" - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) class RWR(PRM[RWRParams]): required_inputs = ['network','nodes'] diff --git a/spras/strwr.py b/spras/strwr.py index 37590e7c6..1b9159eff 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -19,7 +19,7 @@ class ST_RWRParams(BaseModel): alpha: Optional[float] = None "The chance of a restart during the random walk" - model_config = ConfigDict(use_attribute_docstrings=True) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) # Note: This class is almost identical to the rwr.py file. class ST_RWR(PRM[ST_RWRParams]): From b07a7ef0f1eba21609f0eb87bffc603a4199723c Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Tue, 15 Jul 2025 20:04:32 +0000 Subject: [PATCH 21/68] style: fmt --- spras/config/algorithms.py | 13 +++++++------ spras/config/container_schema.py | 7 ++++--- spras/config/schema.py | 1 + spras/config/util.py | 2 +- spras/domino.py | 2 +- spras/omicsintegrator1.py | 2 +- spras/runner.py | 2 +- test/test_config.py | 2 +- 8 files changed, 17 insertions(+), 14 deletions(-) diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index 32f6b82d3..8c49c2ae2 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -4,11 +4,12 @@ and rather mainly contains validators and lower-level pydantic code. """ import ast -from typing import Annotated, Any, Callable, cast, get_args, Optional, Union, Literal +from typing import Annotated, Any, Callable, Literal, Optional, Union, cast, get_args import numpy as np +from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, create_model + from spras.runner import algorithms -from pydantic import BaseModel, BeforeValidator, create_model, ConfigDict, Field __all__ = ['AlgorithmUnion'] @@ -33,7 +34,7 @@ def python_evalish_coerce(value: Any) -> Any: if not isinstance(value, str): return value - + # These strings are in the form of function calls `function.name(param1, param2, ...)`. # Since we want to avoid `eval` (since this might be running in the secret-sensitive HTCondor), # we need to parse these functions. @@ -54,7 +55,7 @@ def python_evalish_coerce(value: Any) -> Any: # This should always be an Expression whose body is Call (a function). if not isinstance(value_ast.body, ast.Call): raise ValueError(f'The python code "{value}" should be calling a function directly. Is this meant to be python code?') - + # We get the function name back as a string function_name = ast.unparse(value_ast.body.func) @@ -63,7 +64,7 @@ def python_evalish_coerce(value: Any) -> Any: if function_name not in functions_dict: raise ValueError(f"{function_name} is not an allowed function to be run!") - + return functions_dict[function_name](arguments) def list_coerce(value: Any) -> Any: @@ -126,7 +127,7 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: __config__=ConfigDict(extra='forbid'), **mapped_list_field ) - + # Here is an example of how this would look like inside config.yaml # name: pathlinker # include: true diff --git a/spras/config/container_schema.py b/spras/config/container_schema.py index ea9881a30..c88692678 100644 --- a/spras/config/container_schema.py +++ b/spras/config/container_schema.py @@ -6,10 +6,11 @@ this subsection of the configuration. 
""" +import warnings from dataclasses import dataclass -from pydantic import BaseModel, ConfigDict, Field from typing import Optional -import warnings + +from pydantic import BaseModel, ConfigDict, Field from spras.config.util import CaseInsensitiveEnum @@ -60,7 +61,7 @@ def from_container_settings(settings: ContainerSettings, default_hash_length: in container_prefix = DEFAULT_CONTAINER_PREFIX if settings.registry and settings.registry.base_url != "" and settings.registry.owner != "": container_prefix = settings.registry.base_url + "/" + settings.registry.owner - + return ProcessedContainerSettings( framework=container_framework, unpack_singularity=unpack_singularity, diff --git a/spras/config/schema.py b/spras/config/schema.py index fc502b677..b2ff0b3bd 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -19,6 +19,7 @@ from spras.config.container_schema import ContainerSettings from spras.config.util import CaseInsensitiveEnum + class SummaryAnalysis(BaseModel): include: bool diff --git a/spras/config/util.py b/spras/config/util.py index 0ed99a26e..63799e478 100644 --- a/spras/config/util.py +++ b/spras/config/util.py @@ -4,8 +4,8 @@ only import this config file. """ -from enum import Enum import time +from enum import Enum from typing import Any from pydantic import BaseModel, ConfigDict, Field diff --git a/spras/domino.py b/spras/domino.py index 521f89722..a45a445a2 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -5,8 +5,8 @@ import pandas as pd from pydantic import ConfigDict -from spras.containers import prepare_volume, run_container_and_log from spras.config.util import NondeterministicModel +from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( add_constant, reinsert_direction_col_undirected, diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 1f33c25f7..d9ee603fb 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -1,5 +1,5 @@ -from pathlib import Path import time +from pathlib import Path from typing import Optional from pydantic import BaseModel, ConfigDict, Field diff --git a/spras/runner.py b/spras/runner.py index 4f603f9b9..209a32f42 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -5,8 +5,8 @@ # supported algorithm imports from spras.allpairs import AllPairs from spras.btb import BowTieBuilder -from spras.dataset import Dataset from spras.config.util import Empty +from spras.dataset import Dataset from spras.domino import DOMINO, DominoParams from spras.meo import MEO, MEOParams from spras.mincostflow import MinCostFlow, MinCostFlowParams diff --git a/test/test_config.py b/test/test_config.py index b0031d029..71842c2e1 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -4,8 +4,8 @@ import pytest import spras.config.config as config -from spras.config.schema import DEFAULT_HASH_LENGTH from spras.config.container_schema import DEFAULT_CONTAINER_PREFIX +from spras.config.schema import DEFAULT_HASH_LENGTH filler_dataset_data: dict[str, str | list[str]] = { "data_dir": "fake", From 0bcd1d15ae03e5cfb4b1a0398d64585b713bb7b5 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Tue, 15 Jul 2025 20:29:01 +0000 Subject: [PATCH 22/68] fix: coerce fields to validate default --- spras/config/algorithms.py | 16 +++++++++++----- test/test_config.py | 6 +++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index 8c49c2ae2..c65ddae8a 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -4,6 +4,7 @@ and rather mainly contains validators and lower-level pydantic code. """ import ast +import copy from typing import Annotated, Any, Callable, Literal, Optional, Union, cast, get_args import numpy as np @@ -98,8 +99,14 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: # Map our fields to a list (assuming we have no nested keys), # and specify our user convenience validators - mapped_list_field: dict[str, Annotated] = { - name: (Annotated[ + mapped_list_field: dict[str, Annotated] = dict() + for field_name, field in model.model_fields.items(): + # We need to create a copy of the field, + # as we need to make sure that it gets mapped to the list coerced version of the field. + new_field = copy.deepcopy(field) + new_field.validate_default = True + + mapped_list_field[field_name] = (Annotated[ list[field.annotation], # This order isn't arbitrary. # https://docs.pydantic.dev/latest/concepts/validators/#ordering-of-validators @@ -111,9 +118,8 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: # json_schema_input_type (sensibly) overwrites, so we have to specify the entire union again here. json_schema_input_type=Union[field.annotation, list[field.annotation], str] ) if is_numpy_friendly(field.annotation) else None - ], field) for name, field in model.model_fields.items() - } - + ], new_field) + # Runtime assertion check: mapped_list_field does not contain any `__-prefixed` fields for key in mapped_list_field.keys(): assert not key.startswith("__"), f"A private key has been passed from {name}'s argument schema. " + \ diff --git a/test/test_config.py b/test/test_config.py index 71842c2e1..e38272f94 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -61,10 +61,10 @@ def get_test_config(): "name": "omicsintegrator2", "include": True, "runs": { - "strings": {"dummyMode": ["terminals", "others"], "b": 1}, + "strings": {"dummy_mode": ["terminals", "others"], "b": 1}, # spacing in np.linspace is on purpose - "singleton_string_np_linspace": {"dummyMode": "terminals", "b": "np.linspace(0, 5,2)"}, - "str_array_np_logspace": {"test": ["others", "all"], "g": "np.logspace(1,1)"} + "singleton_string_np_linspace": {"dummy_mode": "terminals", "b": "np.linspace(0, 5,2)"}, + "str_array_np_logspace": {"dummy_mode": ["others", "all"], "g": "np.logspace(1,1)"} } }, { From 1cb5d179a876517f15224aa18aba7e7e719cc9de Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Tue, 15 Jul 2025 21:55:57 +0000 Subject: [PATCH 23/68] fix: test --- config/config.yaml | 3 +- config/egfr.yaml | 57 ++++++------------ config/schema.json | 4 +- spras/config/algorithms.py | 7 ++- spras/config/config.py | 7 ++- test/test_config.py | 120 +++++++++++++++++++++++++++++-------- 6 files changed, 126 insertions(+), 72 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 49ae31f4f..30b438390 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -96,8 +96,7 @@ algorithms: - name: "domino" include: true - params: - include: true + runs: run1: slice_threshold: 0.3 module_threshold: 0.05 diff --git a/config/egfr.yaml b/config/egfr.yaml index 106963c62..363d213a1 100644 --- a/config/egfr.yaml +++ b/config/egfr.yaml @@ -25,39 +25,28 @@ algorithms: - 0.55 - 2 - 10 - d: - - 10 - g: - - 1e-3 - r: - - 0.01 - w: - - 0.1 - mu: - - 0.008 + d: 10 + g: 1e-3 + r: 0.01 + w: 0.1 + mu: 0.008 dummy_mode: ["file"] - name: omicsintegrator2 include: true runs: run1: - b: - - 4 - g: - - 0 + b: 4 + g: 0 run2: - b: - - 2 - g: - - 3 + b: 2 + g: 3 - name: meo include: true runs: run1: local_search: true - max_path_length: - - 3 - rand_restarts: - - 10 + max_path_length: 3 + rand_restarts: 10 run2: local_search: false max_path_length: 2 @@ -68,28 +57,20 @@ algorithms: include: true runs: run1: - slice_threshold: - - 0.3 - module_threshold: - - 0.05 + slice_threshold: 0.3 + module_threshold: 0.05 - name: mincostflow include: true runs: run1: - capacity: - - 15 - flow: - - 80 + capacity: 15 + flow: 80 run2: - capacity: - - 1 - flow: - - 6 + capacity: 1 + flow: 6 run3: - capacity: - - 5 - flow: - - 60 + capacity: 5 + flow: 60 datasets: - data_dir: input edge_files: diff --git a/config/schema.json b/config/schema.json index c15dcaa8a..494736275 100644 --- a/config/schema.json +++ b/config/schema.json @@ -399,7 +399,7 @@ }, "default": { "default": { - "_time": 1752606304.38952, + "_time": 1752611437.804319, "module_threshold": null, "slice_threshold": null } @@ -1086,7 +1086,7 @@ "noisy_edges": null, "random_terminals": null, "dummy_mode": null, - "seed": 1752606304389 + "seed": 1752611437804 } }, "title": "Runs", diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index c65ddae8a..889efab35 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -47,7 +47,8 @@ def python_evalish_coerce(value: Any) -> Any: } # To do this, we get the AST of our string as an expression - value_ast = ast.parse(value, mode='eval', filename='config.yaml') + # (filename='<string>' is to make the error message more closely resemble that of eval.) + value_ast = ast.parse(value, mode='eval', filename='<string>') # Then we do some light parsing - we're only looking to do some literal evaluation # (allowing light python notation) and some basic function parsing. Full python programs @@ -60,7 +61,7 @@ def python_evalish_coerce(value: Any) -> Any: # We get the function name back as a string function_name = ast.unparse(value_ast.body.func) - # and we use the (non-availability) safe `ast.literal_eval` to support light expressions. + # and we use the (non-availability) safe `ast.literal_eval` to support literals passed into functions.
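# For example (illustrative of the coercion sketched above, assuming the
# whitelist maps these names onto the corresponding numpy/builtin calls):
#   "np.linspace(0, 5, 2)" -> the two values 0.0 and 5.0
#   "np.arange(5, 7)"      -> the two values 5 and 6
#   "range(1, 3)"          -> the two values 1 and 2
#   "__import__('os')"     -> ValueError, since only whitelisted functions may be called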
arguments = [ast.literal_eval(arg) for arg in value_ast.body.args] if function_name not in functions_dict: @@ -119,7 +120,7 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: json_schema_input_type=Union[field.annotation, list[field.annotation], str] ) if is_numpy_friendly(field.annotation) else None ], new_field) - + # Runtime assertion check: mapped_list_field does not contain any `__-prefixed` fields for key in mapped_list_field.keys(): assert not key.startswith("__"), f"A private key has been passed from {name}'s argument schema. " + \ diff --git a/spras/config/config.py b/spras/config/config.py index 6eeb760a7..2c0499fb7 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -79,7 +79,7 @@ def __init__(self, raw_config: dict[str, Any]): self.algorithms = None # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations. # Only includes algorithms that are set to be run with 'include: true'. - self.algorithm_params = None + self.algorithm_params: dict[str, dict[str, Any]] = dict() # Deprecated. Previously a dict mapping algorithm names to a Boolean tracking whether they used directed graphs. self.algorithm_directed = None # A dict with the analysis settings @@ -196,6 +196,11 @@ def process_algorithms(self, raw_config: RawConfig): if params_hash in prior_params_hashes: raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file ' f'(current length {self.hash_length}).') + + # We preserve the run name as it carries useful information for the parameter log, + # and is useful for testing. + run_dict["_spras_run_name"] = run_name + self.algorithm_params[alg.name][params_hash] = run_dict def process_analysis(self, raw_config: RawConfig): diff --git a/test/test_config.py b/test/test_config.py index e38272f94..3d8d67d78 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -1,11 +1,17 @@ +import copy import pickle +from typing import Iterable import numpy as np import pytest +from pydantic import BaseModel import spras.config.config as config from spras.config.container_schema import DEFAULT_CONTAINER_PREFIX from spras.config.schema import DEFAULT_HASH_LENGTH +from spras.meo import MEOParams +from spras.mincostflow import MinCostFlowParams +from spras.omicsintegrator2 import DummyMode, OmicsIntegrator2Params filler_dataset_data: dict[str, str | list[str]] = { "data_dir": "fake", @@ -61,9 +67,9 @@ def get_test_config(): "name": "omicsintegrator2", "include": True, "runs": { - "strings": {"dummy_mode": ["terminals", "others"], "b": 1}, + "strings": {"dummy_mode": ["terminals", "others"], "b": 3}, # spacing in np.linspace is on purpose - "singleton_string_np_linspace": {"dummy_mode": "terminals", "b": "np.linspace(0, 5,2)"}, + "singleton_string_np_linspace": {"dummy_mode": "terminals", "b": "np.linspace(0, 5,2,)"}, "str_array_np_logspace": {"dummy_mode": ["others", "all"], "g": "np.logspace(1,1)"} } }, @@ -71,7 +77,8 @@ def get_test_config(): "name": "meo", "include": True, "runs": { - "numbersAndBool": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": True}, + "numbersAndBoolsDuplicate": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": [True, False]}, + "numbersAndBool": {"max_path_length": 2, "rand_restarts": [float(2.0), 3], "local_search": [True]}, "numbersAndBools": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": [True, False]}, "boolArrTest": {"local_search": [True, False], 
"max_path_length": "range(1, 3)"} } @@ -80,7 +87,7 @@ def get_test_config(): "name": "mincostflow", "include": True, "runs": { - "int64artifact": {"flow": "np.arange(5,6)", "capacity": [2, 3]} + "int64artifact": {"flow": "np.arange(5, 7)", "capacity": [2, 3]} } }, ], @@ -104,22 +111,49 @@ def get_test_config(): return test_raw_config -def value_test_util(name: str, configurations: list): - assert name in config.config.algorithm_params, f"{name} isn't a present algorithm configuration!" - - keys = config.config.algorithm_params[name] - values = [config.config.algorithm_params[name][key] for key in keys] +def value_test_util(alg: str, run_name: str, param_type: type[BaseModel], configurations: Iterable[BaseModel]): + """ + Utility test function to be able to test against certain named runs + under algorithms. This is, unfortunately, a very holistic function that depends + on the current state of how config parsing is. + """ + assert alg in config.config.algorithm_params, f"{alg} isn't a present algorithm name!" + runs = config.config.algorithm_params[alg] + # Filter using the internal _spras_run_name key. + runs = {hash: params for hash, params in runs.items() if params["_spras_run_name"] == run_name} + + # We copy values so we don't mutate it + values: list[dict] = copy.deepcopy(list(runs.values())) + for value in values: + # then, remove the internal key for easy comparison. + del value["_spras_run_name"] + + # Since configurations is a bunch of objects, we need to turn those into dictionaries + # and exclude their defaults. + new_configurations = [config.model_dump(exclude_defaults=True) for config in configurations] + + # Same for values, but we reserialize them first + values = [param_type.model_validate(value).model_dump(exclude_defaults=True) for value in values] + + # Now, we need to also remove any dynamic values from values and configurations + # (_time and seeded values) + for value in values: + value.pop("_time", None) + value.pop("seed", None) + for configuration in new_configurations: + configuration.pop("_time", None) + configuration.pop("seed", None) # https://stackoverflow.com/a/50486270/7589775 # Note: We use pickle as we also compare dictionaries in these two sets - some kind of consistent total ordering # is required for the tests to consistently pass when comparing them to `configurations`. 
- set_values = set(tuple(sorted(d.items())) for d in sorted(values, key=lambda x: pickle.dumps(x, protocol=3))) - set_configurations = set(tuple(sorted(d.items())) for d in sorted(configurations, key=lambda x: pickle.dumps(x, protocol=3))) + final_values = sorted(tuple(sorted(d.items())) for d in sorted(values, key=lambda x: pickle.dumps(x, protocol=3))) + final_configurations = sorted(tuple(sorted(d.items())) for d in sorted(new_configurations, key=lambda x: pickle.dumps(x, protocol=3))) - if set_values != set_configurations: - print(f'Got: {set_values}') - print(f'Expected: {set_configurations}') - assert set_values == set_configurations + if final_values != final_configurations: + print(f'Got: {final_values}') + print(f'Expected: {final_configurations}') + assert final_values == final_configurations class TestConfig: """ @@ -225,17 +259,51 @@ def test_config_values(self): test_config = get_test_config() config.init_global(test_config) - value_test_util('strings', [{'test': "str1", 'test2': "str2"}, {'test': 'str1', 'test2': 'str3'}]) - value_test_util('numbersAndBools', [{'a': 1, 'b': float(2.0), 'c': 4, 'd': 5.6, 'f': False}, {'a': 1, 'b': 3, 'c': 4, 'd': 5.6, 'f': False}]) - - value_test_util('singleton_int64_with_array', [{'test': 1, 'test2': 2}, {'test': 1, 'test2': 3}]) - value_test_util('singleton_string_np_linspace', [{'test': "str1", 'test2': 5.0}, {'test': "str1", 'test2': 0.0}]) - value_test_util('str_array_np_logspace', [{'test': "a", 'test2': 10}] * 10 + [{'test': "b", 'test2': 10}] * 10) - - value_test_util('int64artifact', [{'test': 5, 'test2': 2}, {'test': 5, 'test2': 3}]) - - value_test_util('boolArrTest', [{'flags': True, 'range': 1}, {'flags': False, 'range': 2}, - {'flags': False, 'range': 1}, {'flags': True, 'range': 2}]) + value_test_util('omicsintegrator2', 'strings', OmicsIntegrator2Params, [ + OmicsIntegrator2Params(dummy_mode=DummyMode.terminals, b=3), + OmicsIntegrator2Params(dummy_mode=DummyMode.others, b=3) + ]) + + value_test_util('omicsintegrator2', 'singleton_string_np_linspace', OmicsIntegrator2Params, [ + OmicsIntegrator2Params(dummy_mode=DummyMode.terminals, b=5.0), + OmicsIntegrator2Params(dummy_mode=DummyMode.terminals, b=0.0) + ]) + + value_test_util('omicsintegrator2', 'str_array_np_logspace', OmicsIntegrator2Params, [ + # While these both repeat 50 times, parameter hash makes sure to not duplicate the work. + # This serves as a test to make sure _time isn't inserted during parameter combinations. + OmicsIntegrator2Params(dummy_mode=DummyMode.others, g=10), OmicsIntegrator2Params(dummy_mode=DummyMode.all, g=10) + ]) + + value_test_util('meo', 'numbersAndBools', MEOParams, [ + MEOParams(max_path_length=1, rand_restarts=2, local_search=False), + MEOParams(max_path_length=1, rand_restarts=2, local_search=True), + MEOParams(max_path_length=1, rand_restarts=3, local_search=False), + MEOParams(max_path_length=1, rand_restarts=3, local_search=True), + ]) + + # Encoding this behavior: run names are not passed into the parameter hash, + # and thus won't duplicate runs. 
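# ('numbersAndBoolsDuplicate' expands to exactly the same parameter combinations
# as 'numbersAndBools', so every combination resolves to an already-present hash;
# no stored entry keeps this run's name, hence the expected empty list.)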
+ value_test_util('meo', 'numbersAndBoolsDuplicate', MEOParams, []) + + value_test_util('meo', 'numbersAndBool', MEOParams, [ + MEOParams(max_path_length=2, rand_restarts=2, local_search=True), + MEOParams(max_path_length=2, rand_restarts=3, local_search=True), + ]) + + value_test_util('mincostflow', 'int64artifact', MinCostFlowParams, [ + MinCostFlowParams(flow=5, capacity=2), + MinCostFlowParams(flow=5, capacity=3), + MinCostFlowParams(flow=6, capacity=2), + MinCostFlowParams(flow=6, capacity=3) + ]) + + value_test_util('meo', 'boolArrTest', MEOParams, [ + MEOParams(local_search=True, max_path_length=1), + MEOParams(local_search=True, max_path_length=2), + MEOParams(local_search=False, max_path_length=1), + MEOParams(local_search=False, max_path_length=2) + ]) @pytest.mark.parametrize("ml_include, eval_include, expected_ml, expected_eval", [ (True, True, True, True), From c93244ff32dddff416d7f21c838b28fef4ed9cc9 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 15 Jul 2025 22:11:56 +0000 Subject: [PATCH 24/68] fix: correct all algorithm usage --- spras/allpairs.py | 3 ++- spras/btb.py | 3 ++- spras/domino.py | 23 ++++++++++++----------- spras/meo.py | 26 ++++++++++++++------------ spras/mincostflow.py | 19 ++++++++++--------- spras/omicsintegrator1.py | 16 +++++++++------- spras/omicsintegrator2.py | 17 +++++++++-------- spras/pathlinker.py | 17 +++++++++-------- spras/rwr.py | 16 +++++++++------- spras/strwr.py | 18 ++++++++++-------- 10 files changed, 86 insertions(+), 72 deletions(-) diff --git a/spras/allpairs.py b/spras/allpairs.py index bba5df467..5c1476e8a 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -73,7 +73,8 @@ def generate_inputs(data: Dataset, filename_map): header=["#Interactor1", "Interactor2", "Weight"]) @staticmethod - def run(inputs, output_file, args=None, container_settings=ProcessedContainerSettings()): + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() if not inputs["nodetypes"] or not inputs["network"] or not inputs["directed_flag"]: raise ValueError('Required All Pairs Shortest Paths arguments are missing') diff --git a/spras/btb.py b/spras/btb.py index 7f7a1b944..16bce75ae 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -66,7 +66,8 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(inputs, output_file, args=None, container_settings=ProcessedContainerSettings()): + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() # Tests for pytest (docker container also runs this) # Testing out here avoids the trouble that container errors provide diff --git a/spras/domino.py b/spras/domino.py index a45a445a2..d3d761e1f 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -5,6 +5,7 @@ import pandas as pd from pydantic import ConfigDict +from spras.config.container_schema import ProcessedContainerSettings from spras.config.util import NondeterministicModel from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( @@ -77,9 +78,9 @@ def generate_inputs(data, filename_map): header=['ID_interactor_A', 'ppi', 'ID_interactor_B']) @staticmethod - def run(inputs, output_file, args=None, container_framework="docker"): - if not args: - args = DominoParams() + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = 
ProcessedContainerSettings() + if not args: args = DominoParams() # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. if not inputs["network"] or not inputs["active_genes"]: @@ -90,19 +91,19 @@ def run(inputs, output_file, args=None, container_framework="docker"): # Each volume is a tuple (source, destination) volumes = list() - bind_path, network_file = prepare_volume(inputs["network"], work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) - bind_path, node_file = prepare_volume(inputs["active_genes"], work_dir) + bind_path, node_file = prepare_volume(inputs["active_genes"], work_dir, container_settings) volumes.append(bind_path) out_dir = Path(output_file).parent out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) slices_file = Path(out_dir, 'slices.txt') - bind_path, mapped_slices_file = prepare_volume(str(slices_file), work_dir) + bind_path, mapped_slices_file = prepare_volume(str(slices_file), work_dir, container_settings) volumes.append(bind_path) # Make the Python command to run within the container @@ -112,11 +113,11 @@ def run(inputs, output_file, args=None, container_framework="docker"): container_suffix = "domino" run_container_and_log('slicer', - container_framework, container_suffix, slicer_command, volumes, - work_dir) + work_dir, + container_settings) # Make the Python command to run within the container domino_command = ['domino', @@ -136,11 +137,11 @@ def run(inputs, output_file, args=None, container_framework="docker"): domino_command.extend(['--module_threshold', str(args.module_threshold)]) run_container_and_log('DOMINO', - container_framework, container_suffix, domino_command, volumes, - work_dir) + work_dir, + container_settings) # DOMINO creates a new folder in out_dir to output its modules HTML files into called active_genes # The filename is determined by the input active_genes and cannot be configured diff --git a/spras/meo.py b/spras/meo.py index 4b3f9299e..b3b8a5973 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -4,6 +4,7 @@ from pydantic import BaseModel, ConfigDict +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( add_directionality_constant, @@ -145,7 +146,7 @@ def generate_inputs(data, filename_map): # TODO add parameter validation # TODO document required arguments @staticmethod - def run(inputs, output_file=None, args=None, container_framework="docker"): + def run(inputs, output_file=None, args=None, container_settings=None): """ Run Maximum Edge Orientation in the Docker image with the provided parameters. The properties file is generated from the provided arguments. @@ -154,8 +155,8 @@ def run(inputs, output_file=None, args=None, container_framework="docker"): Only the edge output file is retained. All other output files are deleted. 
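A minimal invocation, with hypothetical file paths, might look like:
    MEO.run(inputs={"edges": "meo/edges.txt", "sources": "meo/sources.txt",
                    "targets": "meo/targets.txt"},
            output_file="output/meo-pathway.txt",
            args=MEOParams(max_path_length=3, local_search=True, rand_restarts=10))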
""" - if not args: - args = MEOParams() + if not container_settings: container_settings = ProcessedContainerSettings() + if not args: args = MEOParams() if inputs["edges"] is None or inputs["sources"] is None or inputs["targets"] is None: raise ValueError('Required Maximum Edge Orientation arguments are missing') @@ -165,44 +166,45 @@ def run(inputs, output_file=None, args=None, container_framework="docker"): # Each volume is a tuple (src, dest) volumes = list() - bind_path, edge_file = prepare_volume(inputs["edges"], work_dir) + bind_path, edge_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) - bind_path, source_file = prepare_volume(inputs["sources"], work_dir) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, target_file = prepare_volume(inputs["targets"], work_dir) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) out_dir = Path(output_file).parent # Maximum Edge Orientation requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_output_file = prepare_volume(str(output_file), work_dir) + bind_path, mapped_output_file = prepare_volume(str(output_file), work_dir, container_settings) volumes.append(bind_path) # Hard code the path output filename, which will be deleted path_output_file = Path(out_dir, 'path-output.txt') - bind_path, mapped_path_output = prepare_volume(str(path_output_file), work_dir) + bind_path, mapped_path_output = prepare_volume(str(path_output_file), work_dir, container_settings) volumes.append(bind_path) properties_file = 'meo-properties.txt' properties_file_local = Path(out_dir, properties_file) write_properties(filename=properties_file_local, edges=edge_file, sources=source_file, targets=target_file, edge_output=mapped_output_file, path_output=mapped_path_output, - max_path_length=args.max_path_length, local_search=args.local_search, rand_restarts=args.rand_restarts, framework=container_framework) - bind_path, properties_file = prepare_volume(str(properties_file_local), work_dir) + max_path_length=args.max_path_length, local_search=args.local_search, rand_restarts=args.rand_restarts, + framework=container_settings.framework) + bind_path, properties_file = prepare_volume(str(properties_file_local), work_dir, container_settings) volumes.append(bind_path) command = ['java', '-jar', '/meo/EOMain.jar', properties_file] container_suffix = "meo" run_container_and_log('Maximum Edge Orientation', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) properties_file_local.unlink(missing_ok=True) diff --git a/spras/mincostflow.py b/spras/mincostflow.py index 1f7ff0cf7..05dd22bf5 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -3,6 +3,7 @@ from pydantic import BaseModel, ConfigDict +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( convert_undirected_to_directed, @@ -72,9 +73,9 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(inputs, output_file, args=None, container_framework="docker"): - if not args: - args = MinCostFlowParams() + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() + if not args: args = MinCostFlowParams() # ensures 
that these parameters are required if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: @@ -86,19 +87,19 @@ def run(inputs, output_file, args=None, container_framework="docker"): # the tuple is for mapping the sources, targets, edges, and output volumes = list() - bind_path, sources_file = prepare_volume(inputs["sources"], work_dir) + bind_path, sources_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, targets_file = prepare_volume(inputs["targets"], work_dir) + bind_path, targets_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) - bind_path, edges_file = prepare_volume(inputs["edges"], work_dir) + bind_path, edges_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) # Create a prefix for the output filename and ensure the directory exists out_dir = Path(output_file).parent out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = mapped_out_dir + '/out' @@ -121,11 +122,11 @@ def run(inputs, output_file, args=None, container_framework="docker"): # constructs a docker run call run_container_and_log('MinCostFlow', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) # Check the output of the container out_dir_content = sorted(out_dir.glob('*.sif')) diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index d9ee603fb..9d1396902 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -4,6 +4,7 @@ from pydantic import BaseModel, ConfigDict, Field +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import reinsert_direction_col_mixed from spras.prm import PRM @@ -142,7 +143,8 @@ def generate_inputs(data, filename_map): # TODO add support for knockout argument # TODO add reasonable default values @staticmethod - def run(inputs, output_file, args, container_framework="docker"): + def run(inputs, output_file, args, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() if inputs["edges"] is None or inputs["prizes"] is None or output_file is None: raise ValueError('Required Omics Integrator 1 arguments are missing') @@ -151,10 +153,10 @@ def run(inputs, output_file, args, container_framework="docker"): # Each volume is a tuple (src, dest) volumes = list() - bind_path, edge_file = prepare_volume(inputs["edges"], work_dir) + bind_path, edge_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) - bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir) + bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir, container_settings) volumes.append(bind_path) # 4 dummy mode possibilities: @@ -167,13 +169,13 @@ def run(inputs, output_file, args, container_framework="docker"): if args.dummy_mode == 'file': if inputs["dummy_nodes"] is None: raise ValueError("dummy_nodes file is required when dummy_mode is set to 'file'") - bind_path, dummy_file = prepare_volume(inputs["dummy_nodes"], work_dir) + bind_path, dummy_file = prepare_volume(inputs["dummy_nodes"], work_dir, container_settings) volumes.append(bind_path) out_dir = Path(output_file).parent # Omics 
Integrator 1 requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) conf_file = 'oi1-configuration.txt' @@ -181,7 +183,7 @@ def run(inputs, output_file, args, container_framework="docker"): # Temporary file that will be deleted after running Omics Integrator 1 write_conf(conf_file_local, w=args.w, b=args.b, d=args.d, mu=args.mu, noise=args.noise, g=args.g, r=args.r) - bind_path, conf_file = prepare_volume(str(conf_file_local), work_dir) + bind_path, conf_file = prepare_volume(str(conf_file_local), work_dir, container_settings) volumes.append(bind_path) command = ['python', '/OmicsIntegrator/scripts/forest.py', @@ -213,11 +215,11 @@ def run(inputs, output_file, args, container_framework="docker"): container_suffix = "omics-integrator-1:no-conda" # no-conda version is the default run_container_and_log('Omics Integrator 1', - container_framework, container_suffix, # no-conda version is the default command, volumes, work_dir, + container_settings, {'TMPDIR': mapped_out_dir}) conf_file_local.unlink(missing_ok=True) diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index aef4f3c48..8b5c29799 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -5,6 +5,7 @@ import pandas as pd from pydantic import BaseModel, ConfigDict, Field +from spras.config.container_schema import ProcessedContainerSettings from spras.config.util import CaseInsensitiveEnum from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset @@ -110,7 +111,7 @@ def generate_inputs(data: Dataset, filename_map): # TODO add reasonable default values # TODO document required arguments @staticmethod - def run(inputs, output_file, args=None, container_framework="docker"): + def run(inputs, output_file, args=None, container_settings=None): """ Run Omics Integrator 2 in the Docker image with the provided parameters. Only the .tsv output file is retained and then renamed. 
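Note that when args is omitted under the new signature, a fresh OmicsIntegrator2Params() is constructed per call, so seed falls back to its default_factory (the current UNIX time in milliseconds) and each bare call runs with a new seed unless one is pinned in the config.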
@@ -118,8 +119,8 @@ def run(inputs, output_file, args=None, container_framework="docker"): @param output_file: the name of the output file, which will overwrite any existing file with this name @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) """ - if not args: - args = OmicsIntegrator2Params() + if not container_settings: container_settings = ProcessedContainerSettings() + if not args: args = OmicsIntegrator2Params() if inputs["edges"] is None or inputs["prizes"] is None: raise ValueError('Required Omics Integrator 2 arguments are missing') @@ -129,16 +130,16 @@ def run(inputs, output_file, args=None, container_framework="docker"): # Each volume is a tuple (src, dest) volumes = list() - bind_path, edge_file = prepare_volume(inputs["edges"], work_dir) + bind_path, edge_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) - bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir) + bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir, container_settings) volumes.append(bind_path) out_dir = Path(output_file).parent # Omics Integrator 2 requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(out_dir, work_dir) + bind_path, mapped_out_dir = prepare_volume(out_dir, work_dir, container_settings) volumes.append(bind_path) command = ['OmicsIntegrator', '-e', edge_file, '-p', prize_file, @@ -164,11 +165,11 @@ def run(inputs, output_file, args=None, container_framework="docker"): container_suffix = "omics-integrator-2:v2" run_container_and_log('Omics Integrator 2', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) # TODO do we want to retain other output files? 
# TODO if deleting other output files, write them all to a tmp directory and copy diff --git a/spras/pathlinker.py b/spras/pathlinker.py index da0a91ba2..f71015f0e 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -4,6 +4,7 @@ from pydantic import BaseModel, ConfigDict +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import ( @@ -76,9 +77,9 @@ def generate_inputs(data, filename_map): header=["#Interactor1","Interactor2","Weight"]) @staticmethod - def run(inputs, output_file, args=None, container_framework="docker"): - if not args: - args = PathLinkerParams() + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() + if not args: args = PathLinkerParams() if not inputs["nodetypes"] or not inputs["network"]: raise ValueError('Required PathLinker arguments are missing') @@ -88,10 +89,10 @@ def run(inputs, output_file, args=None, container_framework="docker"): # Each volume is a tuple (src, dest) volumes = list() - bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir) + bind_path, node_file = prepare_volume(inputs["nodetypes"], work_dir, container_settings) volumes.append(bind_path) - bind_path, network_file = prepare_volume(inputs["network"], work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) # PathLinker does not provide an argument to set the output directory @@ -99,7 +100,7 @@ def run(inputs, output_file, args=None, container_framework="docker"): out_dir = Path(output_file).parent # PathLinker requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = mapped_out_dir + '/out' # Use posix path inside the container @@ -113,11 +114,11 @@ def run(inputs, output_file, args=None, container_framework="docker"): container_suffix = "pathlinker:v2" run_container_and_log('PathLinker', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) # Rename the primary output file to match the desired output filename # Currently PathLinker only writes one output file so we do not need to delete others diff --git a/spras/rwr.py b/spras/rwr.py index dff5bdb97..a46e734e6 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -4,6 +4,7 @@ import pandas as pd from pydantic import BaseModel, ConfigDict +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container from spras.dataset import Dataset from spras.interactome import reinsert_direction_col_directed @@ -45,7 +46,8 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(inputs, output_file, args, container_framework="docker"): + def run(inputs, output_file, args, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() if not inputs["nodes"] or not inputs["network"]: raise ValueError('Required RWR arguments are missing') @@ -60,10 +62,10 @@ def run(inputs, output_file, args, container_framework="docker"): # Each volume is a tuple 
(src, dest) volumes = list() - bind_path, nodes_file = prepare_volume(inputs["nodes"], work_dir) + bind_path, nodes_file = prepare_volume(inputs["nodes"], work_dir, container_settings) volumes.append(bind_path) - bind_path, network_file = prepare_volume(inputs["network"], work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) # RWR does not provide an argument to set the output directory @@ -71,7 +73,7 @@ def run(inputs, output_file, args, container_framework="docker"): out_dir = Path(output_file).parent # RWR requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = mapped_out_dir + "/output.txt" command = ['python', @@ -85,11 +87,11 @@ def run(inputs, output_file, args, container_framework="docker"): command.extend(['--alpha', str(args.alpha)]) container_suffix = 'rwr:v1' - out = run_container(container_framework, - container_suffix, + out = run_container(container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) print(out) # Rename the primary output file to match the desired output filename diff --git a/spras/strwr.py b/spras/strwr.py index 1b9159eff..28a76099e 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -4,6 +4,7 @@ import pandas as pd from pydantic import BaseModel, ConfigDict +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container from spras.dataset import Dataset from spras.interactome import reinsert_direction_col_directed @@ -47,7 +48,8 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(inputs, output_file, args, container_framework="docker"): + def run(inputs, output_file, args, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() if not inputs["sources"] or not inputs["targets"] or not inputs["network"] or not output_file: raise ValueError('Required local_neighborhood arguments are missing') @@ -63,13 +65,13 @@ def run(inputs, output_file, args, container_framework="docker"): # Each volume is a tuple (src, dest) volumes = list() - bind_path, source_file = prepare_volume(inputs["sources"], work_dir) + bind_path, source_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, target_file = prepare_volume(inputs["targets"], work_dir) + bind_path, target_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) - bind_path, network_file = prepare_volume(inputs["network"], work_dir) + bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings) volumes.append(bind_path) # ST_RWR does not provide an argument to set the output directory @@ -77,7 +79,7 @@ def run(inputs, output_file, args, container_framework="docker"): out_dir = Path(output_file).parent # ST_RWR requires that the output directory exist out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = mapped_out_dir + "/output.txt" command = 
['python',
@@ -92,11 +94,11 @@ def run(inputs, output_file, args, container_framework="docker"):
             command.extend(['--alpha', str(args.alpha)])
 
         container_suffix = 'st-rwr:v1'
-        out = run_container(container_framework,
-                            container_suffix,
+        out = run_container(container_suffix,
                             command,
                             volumes,
-                            work_dir)
+                            work_dir,
+                            container_settings)
         print(out)
 
         # Rename the primary output file to match the desired output filename

From 69268f4ca83ee6d9977f12d0124e613df67e0ab1 Mon Sep 17 00:00:00 2001
From: "Tristan F.-R."
Date: Tue, 15 Jul 2025 23:10:59 +0000
Subject: [PATCH 25/68] chore: talk about resumability

---
 Snakefile              |  2 +-
 spras/config/schema.py | 24 ++++++++++++++++++++----
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/Snakefile b/Snakefile
index 34681cb02..358a83f42 100644
--- a/Snakefile
+++ b/Snakefile
@@ -25,7 +25,7 @@ algorithm_params = _config.config.algorithm_params
 algorithm_directed = _config.config.algorithm_directed
 pca_params = _config.config.pca_params
 hac_params = _config.config.hac_params
-FRAMEWORK = _config.config.container_framework
+FRAMEWORK = _config.config.container_settings.framework
 
 # Return the dataset or gold_standard dictionary from the config file given the label
 def get_dataset(_datasets, label):
diff --git a/spras/config/schema.py b/spras/config/schema.py
index b2ff0b3bd..49100bef3 100644
--- a/spras/config/schema.py
+++ b/spras/config/schema.py
@@ -117,11 +117,27 @@ class ReconstructionSettings(BaseModel):
     model_config = ConfigDict(extra='forbid')
 
 class RawConfig(BaseModel):
+    resume: bool = Field(alias="_resume", default=False)
+    """
+    Declares whether a config is resumable. This is meant to be used internally, as it
+    enforces some extra preconditions on the config (namely, that all defaults are explicitly
+    declared within the config and that it matches the specified hash).
+
+    Unlike their non-resumable counterparts, resumable configurations store all configuration
+    defaults (including, most importantly, _time from NondeterministicModel and any seeded values).
+
+    Resumable configurations are generated whenever a non-resumable configuration is run, inside
+    `{output}/resumables/{hash}.yaml`. The stored _time value is used only for file ordering, and
+    {hash} is a hash of the configuration _excluding_ default values.
+
+    By default, SPRAS runs through Snakemake will generate a resumable configuration if none is present,
+    or reuse the configuration associated with its hash otherwise.
+    """
+
     containers: ContainerSettings
 
-    hash_length: int = Field(
-        description="The length of the hash used to identify a parameter combination",
-        default=DEFAULT_HASH_LENGTH)
+    hash_length: int = DEFAULT_HASH_LENGTH
+    "The length of the hash used to identify a parameter combination"
 
     # See algorithms.py for more information about AlgorithmUnion
     algorithms: list[AlgorithmUnion] # type: ignore - pydantic allows this.
@@ -131,4 +147,4 @@ class RawConfig(BaseModel):
 
     reconstruction_settings: ReconstructionSettings
 
-    model_config = ConfigDict(extra='forbid')
+    model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True)

From 5ce01dbd225ea50859d3feb7a5a5f458d7870e4b Mon Sep 17 00:00:00 2001
From: "Tristan F.-R."
Date: Thu, 17 Jul 2025 22:37:17 +0000
Subject: [PATCH 26/68] chore: drop seeds changes

---
 Snakefile                                     | 16 +++++--------
 spras/config/schema.py                        | 17 -------------
 spras/config/util.py                          | 18 --------------
 spras/domino.py                               |  5 ++---
 spras/omicsintegrator1.py                     |  9 +++----
 spras/omicsintegrator2.py                     |  7 +++---
 spras/prm.py                                  | 24 ++++++++++++++++++-
 spras/runner.py                               |  6 +++--
 .../expected_output/test_example_summary.txt  |  2 +-
 9 files changed, 45 insertions(+), 59 deletions(-)

diff --git a/Snakefile b/Snakefile
index 358a83f42..c5c19c4fa 100644
--- a/Snakefile
+++ b/Snakefile
@@ -254,16 +254,12 @@ rule reconstruct:
     run:
         # Create a copy so that the updates are not written to the parameters logfile
         params = reconstruction_params(wildcards.algorithm, wildcards.params).copy()
-        # Add the input files
-        params.update(dict(zip(runner.get_required_inputs(wildcards.algorithm), *{input}, strict=True)))
-        # Add the output file
-        # All run functions can accept a relative path to the output file that should be written that is called 'output_file'
-        params['output_file'] = output.pathway_file
-        # Remove the default placeholder parameter added for algorithms that have no parameters
-        if 'spras_placeholder' in params:
-            params.pop('spras_placeholder')
-        params['container_framework'] = FRAMEWORK
-        runner.run(wildcards.algorithm, params)
+        # Declare the input files as a dictionary.
+        inputs = dict(zip(runner.get_required_inputs(wildcards.algorithm), *{input}, strict=True))
+        # Remove the _spras_run_name parameter, which is added to keep track of the run name for parameters.yml
+        if '_spras_run_name' in params:
+            params.pop('_spras_run_name')
+        runner.run(wildcards.algorithm, inputs, output.pathway_file, params, _config.config.container_settings)
 
 # Original pathway reconstruction output to universal output
 # Use PRRunner as a wrapper to call the algorithm-specific parse_output
diff --git a/spras/config/schema.py b/spras/config/schema.py
index 49100bef3..34549c61d 100644
--- a/spras/config/schema.py
+++ b/spras/config/schema.py
@@ -117,23 +117,6 @@ class ReconstructionSettings(BaseModel):
     model_config = ConfigDict(extra='forbid')
 
 class RawConfig(BaseModel):
-    resume: bool = Field(alias="_resume", default=False)
-    """
-    Declares whether a config is resumable. This is meant to be used internally, as it
-    enforces some extra preconditions on the config (namely, that all defaults are explicitly
-    declared within the config and that it matches the specified hash).
-
-    Unlike their non-resumable counterparts, resumable configurations store all configuration
-    defaults (including, most importantly, _time from NondeterministicModel and any seeded values).
-
-    Resumable configurations are generated whenever a non-resumable configuration is run, inside
-    `{output}/resumables/{hash}.yaml`. The stored _time value is used only for file ordering, and
-    {hash} is a hash of the configuration _excluding_ default values.
-
-    By default, SPRAS runs through Snakemake will generate a resumable configuration if none is present,
-    or reuse the configuration associated with its hash otherwise.
-    """
-
     containers: ContainerSettings
 
     hash_length: int = DEFAULT_HASH_LENGTH
diff --git a/spras/config/util.py b/spras/config/util.py
index 63799e478..eeac6faf4 100644
--- a/spras/config/util.py
+++ b/spras/config/util.py
@@ -34,21 +34,3 @@ class Empty(BaseModel):
     yet are deterministic.
     """
     model_config = ConfigDict(extra="forbid")
-
-class NondeterministicModel(BaseModel):
-    """
-    A nondeterministic model. Any seedless nondeterministic algorithm should extend this.
-    Internally, this inserts a _time parameter that can be serialized but not
-    deserialized, and will affect the hash.
-    """
-
-    # We don't make this a PrivateAttr for reasons explained in the doc comment.
-    time: float = Field(default_factory=time.time, alias="_time")
-    """
-    The internal _time parameter. This is a parameter only given to nondeterminsitic
-    algorithms that provide no randomness seed. While this should be unset,
-    we allow specifying `_time` for users that want to re-use outputs of runs,
-    though this explicitly breaks the 'immutability' promise of runs.
-    """
-
-    model_config = ConfigDict(use_attribute_docstrings=True)
diff --git a/spras/domino.py b/spras/domino.py
index d3d761e1f..8ed4fe5ed 100644
--- a/spras/domino.py
+++ b/spras/domino.py
@@ -3,10 +3,9 @@
 from typing import Optional
 
 import pandas as pd
-from pydantic import ConfigDict
+from pydantic import BaseModel, ConfigDict
 
 from spras.config.container_schema import ProcessedContainerSettings
-from spras.config.util import NondeterministicModel
 from spras.containers import prepare_volume, run_container_and_log
 from spras.interactome import (
     add_constant,
@@ -20,7 +20,7 @@
 ID_PREFIX = 'ENSG0'
 ID_PREFIX_LEN = len(ID_PREFIX)
 
-class DominoParams(NondeterministicModel):
+class DominoParams(BaseModel):
     module_threshold: Optional[float] = None
     "the p-value threshold for considering a slice as relevant (optional)"
diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py
index 9d1396902..c26e18c8a 100644
--- a/spras/omicsintegrator1.py
+++ b/spras/omicsintegrator1.py
@@ -54,8 +54,8 @@ class OmicsIntegrator1Params(BaseModel):
     random_terminals: int = 0
     "How many times to apply the given prizes to random nodes in the interactome"
 
-    seed: int = Field(default_factory=lambda _: int(time.time() * 1000))
-    "The random seed to use for this run. Defaults to the current UNIX timestamp."
+    seed: Optional[int] = None
+    "The random seed to use for this run."
 
     w: int
     "the number of trees"
@@ -198,7 +198,7 @@ def run(inputs, output_file, args, container_settings=None):
         if args.dummy_mode is not None and args.dummy_mode:
             # for custom dummy modes, add the file
             if args.dummy_mode == 'file':
-                command.extend(['--dummyMode', str(inputs["dummy_file"])])
+                command.extend(['--dummyMode', str(inputs["dummy_nodes"])])
             # else pass in the dummy_mode and let oi1 handle it
             else:
                 command.extend(['--dummyMode', args.dummy_mode])
@@ -211,7 +211,8 @@ def run(inputs, output_file, args, container_settings=None):
         command.extend(['--noisyEdges', str(args.noisy_edges)])
         command.extend(['--shuffledPrizes', str(args.shuffled_prizes)])
         command.extend(['--randomTerminals', str(args.random_terminals)])
-        command.extend(['--seed', str(args.seed)])
+        if args.seed is not None:
+            command.extend(['--seed', str(args.seed)])
 
         container_suffix = "omics-integrator-1:no-conda" # no-conda version is the default
         run_container_and_log('Omics Integrator 1',
diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py
index 8b5c29799..bbf45f38b 100644
--- a/spras/omicsintegrator2.py
+++ b/spras/omicsintegrator2.py
@@ -47,8 +47,8 @@ class OmicsIntegrator2Params(BaseModel):
     "all" = connect to all nodes in the interactome.
     """
 
-    seed: int = Field(default_factory=lambda _: int(time.time() * 1000))
-    "The random seed to use for this run. Defaults to the current UNIX timestamp."
+    seed: Optional[int] = None
+    "The random seed to use for this run."
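+    # (When seed is None, the --seed flag is omitted from the command entirely,
+    # leaving the underlying tool to choose its own seed.)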
 
     model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True)
@@ -161,7 +161,8 @@ def run(inputs, output_file, args=None, container_settings=None):
         if args.dummy_mode is not None:
             # This argument does not follow the other naming conventions
             command.extend(['--dummyMode', str(args.dummy_mode)])
-        command.extend(['--seed', str(args.seed)])
+        if args.seed is not None:
+            command.extend(['--seed', str(args.seed)])
 
         container_suffix = "omics-integrator-2:v2"
         run_container_and_log('Omics Integrator 2',
diff --git a/spras/prm.py b/spras/prm.py
index d52214083..eacf123d4 100644
--- a/spras/prm.py
+++ b/spras/prm.py
@@ -1,6 +1,6 @@
 import os
 from abc import ABC, abstractmethod
-from typing import Any, Generic, TypeVar, cast
+from typing import Any, Generic, TypeVar, cast, get_args
 
 from pydantic import BaseModel
 
@@ -41,6 +41,28 @@ def generate_inputs(data: Dataset, filename_map: dict[str, str]):
         """
         raise NotImplementedError
 
+    @classmethod
+    def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: dict[str, Any], container_settings: ProcessedContainerSettings):
+        """
+        This is similar to PRM.run, but it runs pydantic validation internally to re-validate the raw argument parameters.
+        """
+        # awful reflection here, unfortunately:
+        # https://stackoverflow.com/a/71720366/7589775
+        # alternatively, one could have a T_class parameter
+        # for PRM here, but this level of implicitness seems alright.
+        T_class: type[T] = get_args(cast(Any, cls).__orig_bases__[0])[0]
+
+        # Since we just used reflection, we provide a loud, descriptive error message here
+        # to protect against any developer confusion.
+        if not issubclass(T_class, BaseModel):
+            raise RuntimeError("The generic passed into PRM is not a pydantic.BaseModel.")
+
+        # (and pydantic already provides nice error messages, so we don't need to worry about
+        # catching this.)
+        T_parsed = T_class.model_validate(args)
+
+        return cls.run(inputs, output_file, T_parsed, container_settings)
+
     @staticmethod
     @abstractmethod
     def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: T, container_settings: ProcessedContainerSettings):
diff --git a/spras/runner.py b/spras/runner.py
index 209a32f42..861523a43 100644
--- a/spras/runner.py
+++ b/spras/runner.py
@@ -38,12 +38,14 @@ def get_algorithm(algorithm: str) -> type[PRM]:
     except KeyError as exc:
         raise NotImplementedError(f'{algorithm} is not currently supported.') from exc
 
-def run(algorithm: str, params):
+def run(algorithm: str, inputs, output_file, args, container_settings):
     """
     A generic interface to the algorithm-specific run functions
     """
     algorithm_runner = get_algorithm(algorithm)
-    algorithm_runner.run(**params)
+    # We can't use config.config here, else we would get a cyclic dependency.
+    # Since args is a dict here, we use the 'run_typeless' utility PRM function.
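+    # run_typeless reads the concrete params model off the algorithm's PRM[T]
+    # generic, validates the args dict against it, and dispatches to run with the
+    # typed model (e.g. a PathLinker dict {"k": 10} becomes PathLinkerParams(k=10)).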
+ algorithm_runner.run_typeless(inputs, output_file, args, container_settings) def get_required_inputs(algorithm: str): diff --git a/test/analysis/expected_output/test_example_summary.txt b/test/analysis/expected_output/test_example_summary.txt index 2d35023ef..f699fc28e 100644 --- a/test/analysis/expected_output/test_example_summary.txt +++ b/test/analysis/expected_output/test_example_summary.txt @@ -1,5 +1,5 @@ Name Number of nodes Number of edges Number of connected components Nodes in prize Nodes in active Nodes in dummy Nodes in sources Nodes in targets Parameter combination -test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt 3 2 1 2 2 1 1 1 {'spras_placeholder': 'no parameters'} +test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt 3 2 1 2 2 1 1 1 {} test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt 0 0 0 0 0 0 0 0 {'slice_threshold': 0.3, 'module_threshold': 0.05} test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt 3 2 1 2 2 1 1 1 {'max_path_length': 3, 'local_search': 'Yes', 'rand_restarts': 10} test/analysis/input/example/data0-mincostflow-params-SZPZVU6_pathway.txt 3 2 1 2 2 1 1 1 {'flow': 1, 'capacity': 1} From 126d99f306d4e28655b0258207db4bf25b6d42f5 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 17 Jul 2025 23:08:38 +0000 Subject: [PATCH 27/68] fix: correct dummy file --- spras/omicsintegrator1.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index c26e18c8a..be43c8ba9 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -166,6 +166,7 @@ def run(inputs, output_file, args, container_settings=None): # 4. file -> connect the dummy node to a specific list of nodes provided in a file # add dummy node file to the volume if dummy_mode is not None and it is 'file' + dummy_file = None if args.dummy_mode == 'file': if inputs["dummy_nodes"] is None: raise ValueError("dummy_nodes file is required when dummy_mode is set to 'file'") @@ -197,8 +198,8 @@ def run(inputs, output_file, args, container_settings=None): # add the dummy mode argument if args.dummy_mode is not None and args.dummy_mode: # for custom dummy modes, add the file - if args.dummy_mode == 'file': - command.extend(['--dummyMode', str(inputs["dummy_nodes"])]) + if dummy_file: + command.extend(['--dummyMode', dummy_file]) # else pass in the dummy_mode and let oi1 handle it else: command.extend(['--dummyMode', args.dummy_mode]) From c705ef786f734287bc1476ac986fc70c042955ee Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 24 Jul 2025 20:36:31 +0000 Subject: [PATCH 28/68] feat: add responsenet params --- spras/responsenet.py | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/spras/responsenet.py b/spras/responsenet.py index eff83f3ea..35fda7669 100644 --- a/spras/responsenet.py +++ b/spras/responsenet.py @@ -1,4 +1,5 @@ from pathlib import Path +from pydantic import BaseModel, ConfigDict from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( @@ -8,7 +9,16 @@ from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['ResponseNet'] +__all__ = ['ResponseNet', 'ResponseNetParams'] + +class ResponseNetParams(BaseModel): + gamma: int = 10 + """ + The 'size' of the graph. The higher gamma is, the more flow + is encouraged to start from the source nodes. 
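+
+    (Roughly, in ResponseNet's flow formulation, gamma weights the total flow
+    leaving the sources against the cost of using low-confidence edges, so
+    higher values yield larger, less conservative subnetworks.)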
+ """ + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) """ ResponseNet will construct a fully directed graph from the provided input file @@ -20,7 +30,7 @@ - the expected raw input file should have node pairs in the 1st and 2nd columns, with a weight in the 3rd column - it can include bidirectional edges, but will only keep one copy of repeated edges """ -class ResponseNet(PRM): +class ResponseNet(PRM[ResponseNetParams]): required_inputs = ['sources', 'targets', 'edges'] dois = ["10.1038/ng.337"] @@ -57,19 +67,9 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(sources=None, targets=None, edges=None, output_file=None, gamma=10, container_framework="docker"): - """ - Run ResponseNet with Docker (or singularity) - @param sources: input sources (required) - @param targets: input targets (required) - @param edges: input network file (required) - @param output_file: output file name (required) - @param gamma: integer representing gamma (optional, default is 10) - @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional) - """ - + def run(inputs, output_file, args, container_settings): # ensures that these parameters are required - if not sources or not targets or not edges or not output_file: + if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: raise ValueError('Required ResponseNet arguments are missing') # the data files will be mapped within this directory within the container @@ -78,23 +78,23 @@ def run(sources=None, targets=None, edges=None, output_file=None, gamma=10, cont # the tuple is for mapping the sources, targets, edges, and output volumes = list() - bind_path, sources_file = prepare_volume(sources, work_dir) + bind_path, sources_file = prepare_volume(inputs["sources"], work_dir, container_settings) volumes.append(bind_path) - bind_path, targets_file = prepare_volume(targets, work_dir) + bind_path, targets_file = prepare_volume(inputs["targets"], work_dir, container_settings) volumes.append(bind_path) - bind_path, edges_file = prepare_volume(edges, work_dir) + bind_path, edges_file = prepare_volume(inputs["edges"], work_dir, container_settings) volumes.append(bind_path) # Create a prefix for the output filename and ensure the directory exists out_dir = Path(output_file).parent out_dir.mkdir(parents=True, exist_ok=True) - bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir, container_settings) volumes.append(bind_path) mapped_out_prefix = Path(mapped_out_dir) - out_file_suffixed = out_dir / f'output_gamma{str(gamma)}.txt' + out_file_suffixed = out_dir / f'output_gamma{str(args.gamma)}.txt' # Makes the Python command to run within in the container command = ['python', @@ -103,18 +103,18 @@ def run(sources=None, targets=None, edges=None, output_file=None, gamma=10, cont '--sources_file', sources_file, '--targets_file', targets_file, '--output', str(mapped_out_prefix / 'output'), - '--gamma', str(gamma)] + '--gamma', str(args.gamma)] # choosing to run in docker or singularity container container_suffix = "responsenet:v2" # constructs a docker run call run_container_and_log('ResponseNet', - container_framework, container_suffix, command, volumes, - work_dir) + work_dir, + container_settings) # Rename the primary output file to match the desired output filename out_file_suffixed.rename(output_file) From 34719d8dd25ac386861f5c64b0912370e7900c0d Mon Sep 17 
00:00:00 2001 From: "Tristan F.-R." Date: Thu, 24 Jul 2025 20:36:55 +0000 Subject: [PATCH 29/68] style: fmt --- spras/config/config.py | 2 -- spras/config/container_schema.py | 3 +-- spras/config/util.py | 3 +-- spras/omicsintegrator1.py | 3 +-- spras/omicsintegrator2.py | 3 +-- spras/pathlinker.py | 1 - spras/responsenet.py | 1 + test/test_config.py | 1 - 8 files changed, 5 insertions(+), 12 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index b986bacdb..32661fec2 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -15,8 +15,6 @@ import copy as copy import itertools as it import os -import warnings -from collections.abc import Iterable from typing import Any import numpy as np diff --git a/spras/config/container_schema.py b/spras/config/container_schema.py index c88692678..11b13576f 100644 --- a/spras/config/container_schema.py +++ b/spras/config/container_schema.py @@ -8,9 +8,8 @@ import warnings from dataclasses import dataclass -from typing import Optional -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict from spras.config.util import CaseInsensitiveEnum diff --git a/spras/config/util.py b/spras/config/util.py index eeac6faf4..73b4dbeaf 100644 --- a/spras/config/util.py +++ b/spras/config/util.py @@ -4,11 +4,10 @@ only import this config file. """ -import time from enum import Enum from typing import Any -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict # https://stackoverflow.com/a/76883868/7589775 diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index be43c8ba9..a743c2061 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -1,8 +1,7 @@ -import time from pathlib import Path from typing import Optional -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 3d8032ca8..aed24577b 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -1,9 +1,8 @@ -import time from pathlib import Path from typing import Optional import pandas as pd -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict from spras.config.container_schema import ProcessedContainerSettings from spras.config.util import CaseInsensitiveEnum diff --git a/spras/pathlinker.py b/spras/pathlinker.py index f71015f0e..1b2b67675 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -1,6 +1,5 @@ import warnings from pathlib import Path -from typing import Optional from pydantic import BaseModel, ConfigDict diff --git a/spras/responsenet.py b/spras/responsenet.py index 35fda7669..2144b3136 100644 --- a/spras/responsenet.py +++ b/spras/responsenet.py @@ -1,4 +1,5 @@ from pathlib import Path + from pydantic import BaseModel, ConfigDict from spras.containers import prepare_volume, run_container_and_log diff --git a/test/test_config.py b/test/test_config.py index ae3fdb10b..b2c84121b 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -2,7 +2,6 @@ import pickle from typing import Iterable -import numpy as np import pytest from pydantic import BaseModel From 1976c4d21fb2087edf0a96ad4b7b9e31cd0067ca Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Fri, 15 Aug 2025 00:14:54 +0000 Subject: [PATCH 30/68] fix: all configs --- config/config.yaml | 4 +- config/schema.json | 155 +++++++++++++++++++++++--------- test/analysis/input/config.yaml | 58 +++++------- 3 files changed, 136 insertions(+), 81 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 27f0ad0fa..e0af6d62f 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -119,8 +119,8 @@ algorithms: include: true - name: "responsenet" - params: - include: true + include: true + runs: run1: gamma: [10] diff --git a/config/schema.json b/config/schema.json index 494736275..3835749ae 100644 --- a/config/schema.json +++ b/config/schema.json @@ -22,6 +22,8 @@ "aggregate_per_algorithm": false, "components": 2, "labels": true, + "kde": false, + "remove_empty_pathways": false, "linkage": "ward", "metric": "euclidean" } @@ -248,6 +250,16 @@ "title": "Labels", "type": "boolean" }, + "kde": { + "default": false, + "title": "Kde", + "type": "boolean" + }, + "remove_empty_pathways": { + "default": false, + "title": "Remove Empty Pathways", + "type": "boolean" + }, "linkage": { "$ref": "#/$defs/MlLinkage", "default": "ward" @@ -399,7 +411,6 @@ }, "default": { "default": { - "_time": 1752611437.804319, "module_threshold": null, "slice_threshold": null } @@ -418,24 +429,6 @@ "dominoRunModel": { "additionalProperties": false, "properties": { - "_time": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "type": "number" - }, - "type": "array" - }, - { - "type": "string" - } - ], - "description": "The internal _time parameter. This is a parameter only given to nondeterminsitic\nalgorithms that provide no randomness seed. While this should be unset,\nwe allow specifying `_time` for users that want to re-use outputs of runs,\nthough this explicitly breaks the 'immutability' promise of runs.", - "title": "Time" - }, "module_threshold": { "anyOf": [ { @@ -842,7 +835,7 @@ } ], "default": 0, - "description": "shuffled_prizes: How many times the algorithm should shuffle the prizes and re-run", + "description": "How many times the algorithm should shuffle the prizes and re-run", "title": "Shuffled Prizes" }, "random_terminals": { @@ -871,15 +864,26 @@ }, { "items": { - "type": "integer" + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] }, "type": "array" }, { "type": "string" + }, + { + "type": "null" } ], - "description": "The random seed to use for this run. 
Defaults to the current UNIX timestamp.", + "default": null, + "description": "The randomness seed to use.", "title": "Seed" }, "w": { @@ -897,7 +901,7 @@ "type": "string" } ], - "description": "the number of trees", + "description": "Float that affects the number of connected components, with higher values leading to more components", "title": "W" }, "b": { @@ -915,7 +919,7 @@ "type": "string" } ], - "description": "the trade-off between including more terminals and using less reliable edges", + "description": "The trade-off between including more prizes and using less reliable edgess", "title": "B" }, "d": { @@ -933,7 +937,7 @@ "type": "string" } ], - "description": "controls the maximum path-length from v0 to terminal nodes", + "description": "Controls the maximum path-length from root to terminal nodes", "title": "D" }, "mu": { @@ -943,26 +947,16 @@ }, { "items": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ] + "type": "number" }, "type": "array" }, { "type": "string" - }, - { - "type": "null" } ], - "default": null, - "description": "controls the degree-based negative prizes (defualt 0.0)", + "default": 0.0, + "description": "Controls the degree-based negative prizes (defualt 0.0)", "title": "Mu" }, "noise": { @@ -1049,7 +1043,7 @@ } ], "default": null, - "description": "msgsteiner parameter that adds random noise to edges, which is rarely needed because the Forest --noisyEdges option is recommended instead (default 0)", + "description": "msgsteiner parameter that adds random noise to edges, which is rarely needed because the --noisyEdges option is recommended instead (default 0)", "title": "R" } }, @@ -1086,7 +1080,7 @@ "noisy_edges": null, "random_terminals": null, "dummy_mode": null, - "seed": 1752611437804 + "seed": null } }, "title": "Runs", @@ -1280,15 +1274,26 @@ }, { "items": { - "type": "integer" + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] }, "type": "array" }, { "type": "string" + }, + { + "type": "null" } ], - "description": "The random seed to use for this run. Defaults to the current UNIX timestamp.", + "default": null, + "description": "The random seed to use for this run.", "title": "Seed" } }, @@ -1353,6 +1358,64 @@ "title": "pathlinkerRunModel", "type": "object" }, + "responsenetModel": { + "additionalProperties": false, + "properties": { + "name": { + "const": "responsenet", + "title": "Name", + "type": "string" + }, + "include": { + "title": "Include", + "type": "boolean" + }, + "runs": { + "additionalProperties": { + "$ref": "#/$defs/responsenetRunModel" + }, + "default": { + "default": { + "gamma": 10 + } + }, + "title": "Runs", + "type": "object" + } + }, + "required": [ + "name", + "include" + ], + "title": "responsenetModel", + "type": "object" + }, + "responsenetRunModel": { + "additionalProperties": false, + "properties": { + "gamma": { + "anyOf": [ + { + "type": "integer" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "default": 10, + "description": "The 'size' of the graph. 
The higher gamma is, the more flow\nis encouraged to start from the source nodes.", + "title": "Gamma" + } + }, + "title": "responsenetRunModel", + "type": "object" + }, "rwrModel": { "additionalProperties": false, "properties": { @@ -1547,6 +1610,7 @@ "omicsintegrator1": "#/$defs/omicsintegrator1Model", "omicsintegrator2": "#/$defs/omicsintegrator2Model", "pathlinker": "#/$defs/pathlinkerModel", + "responsenet": "#/$defs/responsenetModel", "rwr": "#/$defs/rwrModel", "strwr": "#/$defs/strwrModel" }, @@ -1577,6 +1641,9 @@ { "$ref": "#/$defs/pathlinkerModel" }, + { + "$ref": "#/$defs/responsenetModel" + }, { "$ref": "#/$defs/rwrModel" }, @@ -1616,9 +1683,11 @@ "aggregate_per_algorithm": false, "components": 2, "include": false, + "kde": false, "labels": true, "linkage": "ward", - "metric": "euclidean" + "metric": "euclidean", + "remove_empty_pathways": false }, "evaluation": { "aggregate_per_algorithm": false, diff --git a/test/analysis/input/config.yaml b/test/analysis/input/config.yaml index abde6f979..90c67a43e 100644 --- a/test/analysis/input/config.yaml +++ b/test/analysis/input/config.yaml @@ -1,37 +1,24 @@ -# The length of the hash used to identify a parameter combination +# yaml-language-server: $schema=./schema.json hash_length: 7 -# Specify the container framework. Current supported versions include 'docker' and -# 'singularity'. If container_framework is not specified, SPRAS will default to docker. -container_framework: docker - -# Only used if container_framework is set to singularity, this will unpack the singularity containers -# to the local filesystem. This is useful when PRM containers need to run inside another container, -# such as would be the case in an HTCondor/OSPool environment. -# NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way -# that persists after the workflow is complete. To clean up the unpacked containers, the user must -# manually delete them. -unpack_singularity: false - -# Allow the user to configure which container registry containers should be pulled from -# Note that this assumes container names are consistent across registries, and that the -# registry being passed doesn't require authentication for pull actions -container_registry: - base_url: docker.io - # The owner or project of the registry - # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs - owner: reedcompbio +containers: + framework: docker + unpack_singularity: false + registry: + base_url: docker.io + owner: reedcompbio + hash_length: 7 algorithms: - name: "pathlinker" - params: - include: true + include: true + runs: run1: k: range(100,201,100) - name: "omicsintegrator1" - params: - include: true + include: true + runs: run1: b: [5, 6] w: np.linspace(0,5,2) @@ -39,8 +26,8 @@ algorithms: dummy_mode: ["file"] - name: "omicsintegrator2" - params: - include: true + include: true + runs: run1: b: [4] g: [0] @@ -49,27 +36,26 @@ algorithms: g: [3] - name: "meo" - params: - include: true + include: true + runs: run1: max_path_length: [3] - local_search: ["Yes"] + local_search: [true] rand_restarts: [10] - name: "mincostflow" - params: - include: true + include: true + runs: run1: flow: [1] # The flow must be an int capacity: [1] - name: "allpairs" - params: - include: true + include: true - name: "domino" - params: - include: true + include: true + runs: run1: slice_threshold: [0.3] module_threshold: [0.05] From b31b857944839ed63d865fa090b10456f4a91840 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Fri, 15 Aug 2025 00:43:26 +0000 Subject: [PATCH 31/68] refactor: the central validate_required_run_args --- spras/allpairs.py | 3 +-- spras/btb.py | 9 ++------- spras/domino.py | 3 +-- spras/meo.py | 4 +--- spras/mincostflow.py | 5 +---- spras/omicsintegrator1.py | 3 +-- spras/omicsintegrator2.py | 4 +--- spras/pathlinker.py | 4 +--- spras/prm.py | 28 +++++++++++++++++++++++++++- spras/responsenet.py | 4 +--- spras/rwr.py | 3 +-- spras/strwr.py | 3 +-- test/BowTieBuilder/test_btb.py | 2 +- 13 files changed, 40 insertions(+), 35 deletions(-) diff --git a/spras/allpairs.py b/spras/allpairs.py index 14187ee2c..89595ffef 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -73,8 +73,7 @@ def generate_inputs(data: Dataset, filename_map): @staticmethod def run(inputs, output_file, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() - if not inputs["nodetypes"] or not inputs["network"] or not inputs["directed_flag"]: - raise ValueError('Required All Pairs Shortest Paths arguments are missing') + AllPairs.validate_required_run_args(inputs) work_dir = '/apsp' diff --git a/spras/btb.py b/spras/btb.py index 4cf8fa76c..379d96f47 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -66,15 +66,10 @@ def generate_inputs(data, filename_map): @staticmethod def run(inputs, output_file, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() + BowTieBuilder.validate_required_run_args(inputs) + # Tests for pytest (docker container also runs this) # Testing out here avoids the trouble that container errors provide - - if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: - raise ValueError('Required BowTieBuilder arguments are missing') - - if not Path(inputs["sources"]).exists() or not Path(inputs["targets"]).exists() or not Path(inputs["edges"]).exists(): - raise ValueError('Missing input file') - # Testing for btb index errors # TODO: This error will never actually occur if the inputs are passed through # `generate_inputs`. See the discussion about removing this or making this a habit at diff --git a/spras/domino.py b/spras/domino.py index 2926485fb..dc43af9b9 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -79,10 +79,9 @@ def generate_inputs(data, filename_map): def run(inputs, output_file, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() if not args: args = DominoParams() + DOMINO.validate_required_run_args(inputs) # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. 
- if not inputs["network"] or not inputs["active_genes"]: - raise ValueError('Required DOMINO arguments are missing') work_dir = '/spras' diff --git a/spras/meo.py b/spras/meo.py index 7019d6053..205eb4ca8 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -155,9 +155,7 @@ def run(inputs, output_file=None, args=None, container_settings=None): """ if not container_settings: container_settings = ProcessedContainerSettings() if not args: args = MEOParams() - - if inputs["edges"] is None or inputs["sources"] is None or inputs["targets"] is None: - raise ValueError('Required Maximum Edge Orientation arguments are missing') + MEO.validate_required_run_args(inputs) work_dir = '/spras' diff --git a/spras/mincostflow.py b/spras/mincostflow.py index 2d8961763..81d9198d4 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -76,10 +76,7 @@ def generate_inputs(data, filename_map): def run(inputs, output_file, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() if not args: args = MinCostFlowParams() - - # ensures that these parameters are required - if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: - raise ValueError('Required MinCostFlow arguments are missing') + MinCostFlow.validate_required_run_args(inputs) # the data files will be mapped within this directory within the container work_dir = '/mincostflow' diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 5a33a3e1b..56ccbe8ba 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -142,8 +142,7 @@ def generate_inputs(data, filename_map): @staticmethod def run(inputs, output_file, args, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() - if inputs["edges"] is None or inputs["prizes"] is None or output_file is None: - raise ValueError('Required Omics Integrator 1 arguments are missing') + OmicsIntegrator1.validate_required_run_args(inputs, ["dummy_nodes"]) work_dir = '/spras' diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 2ee84ebd8..00ef867c5 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -128,9 +128,7 @@ def run(inputs, output_file, args=None, container_settings=None): """ if not container_settings: container_settings = ProcessedContainerSettings() if not args: args = OmicsIntegrator2Params() - - if inputs["edges"] is None or inputs["prizes"] is None: - raise ValueError('Required Omics Integrator 2 arguments are missing') + OmicsIntegrator2.validate_required_run_args(inputs) work_dir = '/spras' diff --git a/spras/pathlinker.py b/spras/pathlinker.py index faa2f4fd0..b3e2f58c2 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -77,9 +77,7 @@ def generate_inputs(data, filename_map): def run(inputs, output_file, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() if not args: args = PathLinkerParams() - - if not inputs["nodetypes"] or not inputs["network"]: - raise ValueError('Required PathLinker arguments are missing') + PathLinker.validate_required_run_args(inputs) work_dir = '/spras' diff --git a/spras/prm.py b/spras/prm.py index e112dac43..d5fb40bd1 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -1,7 +1,7 @@ import os from abc import ABC, abstractmethod from typing import Any, Generic, TypeVar, cast, get_args - +from pathlib import Path from pydantic import BaseModel from spras.config.container_schema import ProcessedContainerSettings @@ 
-82,3 +82,29 @@ def validate_required_inputs(cls, filename_map: dict[str, str]): for input_type in cls.required_inputs: if input_type not in filename_map: raise ValueError("{input_type} filename is missing") + + @classmethod + def validate_required_run_args(cls, inputs: dict[str, str | os.PathLike], relax: list[str] = []): + """ + Validates the `inputs` parameter for `PRM#run`. + + @param inputs: See `PRM#run`. + @param relax: List of inputs that aren't required: if they are specified, they should be valid path + """ + + # Check that `relax` is a valid list + for entry in relax: + if entry not in cls.required_inputs: + raise RuntimeError(f"{relax} is not contained in this PRM's required inputs ({cls.required_inputs}). This should have been caught in testing.") + + for input_type in cls.required_inputs: + if input_type not in inputs or not inputs[input_type]: + # Ignore relaxed inputs + if input_type in relax: + continue + raise ValueError(f'Required input "{input_type}" is not set') + + path = Path(inputs[input_type]) + if not path.exists(): + raise OSError(f'Required input "{input_type}" is pointing to a missing file "{path}".') + diff --git a/spras/responsenet.py b/spras/responsenet.py index 1d059289a..b87fe6db0 100644 --- a/spras/responsenet.py +++ b/spras/responsenet.py @@ -65,9 +65,7 @@ def generate_inputs(data, filename_map): @staticmethod def run(inputs, output_file, args, container_settings): - # ensures that these parameters are required - if not inputs["sources"] or not inputs["targets"] or not inputs["edges"]: - raise ValueError('Required ResponseNet arguments are missing') + ResponseNet.validate_required_run_args(inputs) # the data files will be mapped within this directory within the container work_dir = '/ResponseNet' diff --git a/spras/rwr.py b/spras/rwr.py index 1600ca039..8bc64b5cd 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -46,8 +46,7 @@ def generate_inputs(data, filename_map): @staticmethod def run(inputs, output_file, args, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() - if not inputs["nodes"] or not inputs["network"]: - raise ValueError('Required RWR arguments are missing') + RWR.validate_required_run_args(inputs) with Path(inputs["network"]).open() as network_f: for line in network_f: diff --git a/spras/strwr.py b/spras/strwr.py index d09bbc821..5f788f774 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -47,8 +47,7 @@ def generate_inputs(data, filename_map): @staticmethod def run(inputs, output_file, args, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() - if not inputs["sources"] or not inputs["targets"] or not inputs["network"] or not output_file: - raise ValueError('Required local_neighborhood arguments are missing') + ST_RWR.validate_required_run_args(inputs) with Path(inputs["network"]).open() as network_f: for line in network_f: diff --git a/test/BowTieBuilder/test_btb.py b/test/BowTieBuilder/test_btb.py index 37fa00070..94d762196 100644 --- a/test/BowTieBuilder/test_btb.py +++ b/test/BowTieBuilder/test_btb.py @@ -43,7 +43,7 @@ def test_btb_missing(self): Run the BowTieBuilder algorithm with missing files """ def test_btb_file(self): - with pytest.raises(ValueError): + with pytest.raises(OSError): BTB.run({"sources": Path(TEST_DIR, 'input', 'unknown.txt'), "targets": Path(TEST_DIR, 'input', 'target.txt'), "edges": Path(TEST_DIR, 'input', 'edges.txt')}, From 2d0c3ec7f85704376e539c70662e9d451910f042 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Fri, 15 Aug 2025 01:16:13 +0000 Subject: [PATCH 32/68] test: use correct raises err for btb --- test/BowTieBuilder/test_btb.py | 17 ++++++++--------- test/OmicsIntegrator1/test_oi1.py | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/test/BowTieBuilder/test_btb.py b/test/BowTieBuilder/test_btb.py index 94d762196..26c4d737f 100644 --- a/test/BowTieBuilder/test_btb.py +++ b/test/BowTieBuilder/test_btb.py @@ -22,17 +22,17 @@ class TestBowTieBuilder: Run the BowTieBuilder algorithm with missing arguments """ def test_btb_missing(self): - with pytest.raises(ValueError): + with pytest.raises(OSError): # No edges BTB.run({"targets": Path(TEST_DIR, 'input', 'target.txt'), "sources": Path(TEST_DIR, 'input', 'source.txt')}, output_file=OUT_FILE_DEFAULT) - with pytest.raises(ValueError): + with pytest.raises(OSError): # No source BTB.run({"targets": Path(TEST_DIR, 'input', 'target.txt'), "edges": Path(TEST_DIR, 'input', 'edges.txt')}, output_file=OUT_FILE_DEFAULT) - with pytest.raises(ValueError): + with pytest.raises(OSError): # No target BTB.run({"sources": Path(TEST_DIR, 'input', 'source.txt'), "edges": Path(TEST_DIR, 'input', 'edges.txt')}, @@ -126,12 +126,11 @@ def test_disjoint2(self): Run the BowTieBuilder algorithm with a missing input file """ def test_missing_file(self): - with pytest.raises(ValueError): - with pytest.raises(OSError): - BTB.run({"edges": Path(TEST_DIR, 'input', 'missing.txt'), - "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), - "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, - output_file=OUT_FILE_DEFAULT) + with pytest.raises(OSError): + BTB.run({"edges": Path(TEST_DIR, 'input', 'missing.txt'), + "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'), + "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')}, + output_file=OUT_FILE_DEFAULT) """ diff --git a/test/OmicsIntegrator1/test_oi1.py b/test/OmicsIntegrator1/test_oi1.py index fad4627e0..b05dc401a 100644 --- a/test/OmicsIntegrator1/test_oi1.py +++ b/test/OmicsIntegrator1/test_oi1.py @@ -121,5 +121,5 @@ def test_oi1_singularity(self): w=5, b=1, d=10), - container_framework="singularity") + container_settings="singularity") assert out_path.exists() From b8ee227ac09ca09081eccf4b8dae68a3ac532f1c Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 14 Aug 2025 23:23:38 -0700 Subject: [PATCH 33/68] better err handling for too many keys --- spras/prm.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spras/prm.py b/spras/prm.py index d5fb40bd1..6ee65cf0c 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -97,6 +97,7 @@ def validate_required_run_args(cls, inputs: dict[str, str | os.PathLike], relax: if entry not in cls.required_inputs: raise RuntimeError(f"{relax} is not contained in this PRM's required inputs ({cls.required_inputs}). 
This should have been caught in testing.") + # Check that all non-relaxed required inputs are present for input_type in cls.required_inputs: if input_type not in inputs or not inputs[input_type]: # Ignore relaxed inputs @@ -107,4 +108,8 @@ def validate_required_run_args(cls, inputs: dict[str, str | os.PathLike], relax: path = Path(inputs[input_type]) if not path.exists(): raise OSError(f'Required input "{input_type}" is pointing to a missing file "{path}".') - + + # Then, check that all inputs are required inputs (to prevent typos / catch errors when inputs are updated) + for input_type in inputs.keys(): + if input_type not in cls.required_inputs: + raise ValueError(f'Extra input "{input_type}" was provided but is not present in required inputs ({cls.required_inputs})') From b61782554944c03ca8a182c50994ec46beec534d Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sat, 6 Sep 2025 05:51:48 +0000 Subject: [PATCH 34/68] chore: drop schema see #358 --- config/config.yaml | 2 - config/egfr.yaml | 2 - config/schema.json | 1710 ------------------------------- test/analysis/input/config.yaml | 1 - util/update_schema.py | 13 - 5 files changed, 1728 deletions(-) delete mode 100644 config/schema.json delete mode 100644 util/update_schema.py diff --git a/config/config.yaml b/config/config.yaml index e0af6d62f..5382beea1 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,5 +1,3 @@ -# yaml-language-server: $schema=./schema.json - # Global workflow control # The length of the hash used to identify a parameter combination diff --git a/config/egfr.yaml b/config/egfr.yaml index 6a07727d3..7b849a8cc 100644 --- a/config/egfr.yaml +++ b/config/egfr.yaml @@ -1,5 +1,3 @@ -# yaml-language-server: $schema=./schema.json - hash_length: 7 containers: framework: docker diff --git a/config/schema.json b/config/schema.json deleted file mode 100644 index 3835749ae..000000000 --- a/config/schema.json +++ /dev/null @@ -1,1710 +0,0 @@ -{ - "$defs": { - "Analysis": { - "additionalProperties": false, - "properties": { - "summary": { - "$ref": "#/$defs/SummaryAnalysis", - "default": { - "include": false - } - }, - "cytoscape": { - "$ref": "#/$defs/CytoscapeAnalysis", - "default": { - "include": false - } - }, - "ml": { - "$ref": "#/$defs/MlAnalysis", - "default": { - "include": false, - "aggregate_per_algorithm": false, - "components": 2, - "labels": true, - "kde": false, - "remove_empty_pathways": false, - "linkage": "ward", - "metric": "euclidean" - } - }, - "evaluation": { - "$ref": "#/$defs/EvaluationAnalysis", - "default": { - "include": false, - "aggregate_per_algorithm": false - } - } - }, - "title": "Analysis", - "type": "object" - }, - "ContainerFramework": { - "enum": [ - "docker", - "singularity", - "dsub" - ], - "title": "ContainerFramework", - "type": "string" - }, - "ContainerRegistry": { - "additionalProperties": false, - "properties": { - "base_url": { - "default": "docker.io", - "description": "The domain of the registry", - "title": "Base Url", - "type": "string" - }, - "owner": { - "default": "reedcompbio", - "description": "The owner or project of the registry", - "title": "Owner", - "type": "string" - } - }, - "title": "ContainerRegistry", - "type": "object" - }, - "ContainerSettings": { - "additionalProperties": false, - "properties": { - "framework": { - "$ref": "#/$defs/ContainerFramework", - "default": "docker" - }, - "unpack_singularity": { - "default": false, - "title": "Unpack Singularity", - "type": "boolean" - }, - "registry": { - "$ref": "#/$defs/ContainerRegistry" - }, - 
"hash_length": { - "default": 7, - "title": "Hash Length", - "type": "integer" - } - }, - "required": [ - "registry" - ], - "title": "ContainerSettings", - "type": "object" - }, - "CytoscapeAnalysis": { - "additionalProperties": false, - "properties": { - "include": { - "title": "Include", - "type": "boolean" - } - }, - "required": [ - "include" - ], - "title": "CytoscapeAnalysis", - "type": "object" - }, - "Dataset": { - "additionalProperties": false, - "properties": { - "label": { - "title": "Label", - "type": "string" - }, - "node_files": { - "items": { - "type": "string" - }, - "title": "Node Files", - "type": "array" - }, - "edge_files": { - "items": { - "type": "string" - }, - "title": "Edge Files", - "type": "array" - }, - "other_files": { - "items": { - "type": "string" - }, - "title": "Other Files", - "type": "array" - }, - "data_dir": { - "title": "Data Dir", - "type": "string" - } - }, - "required": [ - "label", - "node_files", - "edge_files", - "other_files", - "data_dir" - ], - "title": "Dataset", - "type": "object" - }, - "DummyMode": { - "enum": [ - "terminals", - "others", - "all" - ], - "title": "DummyMode", - "type": "string" - }, - "EvaluationAnalysis": { - "additionalProperties": false, - "properties": { - "include": { - "title": "Include", - "type": "boolean" - }, - "aggregate_per_algorithm": { - "default": false, - "title": "Aggregate Per Algorithm", - "type": "boolean" - } - }, - "required": [ - "include" - ], - "title": "EvaluationAnalysis", - "type": "object" - }, - "GoldStandard": { - "additionalProperties": false, - "properties": { - "label": { - "title": "Label", - "type": "string" - }, - "node_files": { - "items": { - "type": "string" - }, - "title": "Node Files", - "type": "array" - }, - "data_dir": { - "title": "Data Dir", - "type": "string" - }, - "dataset_labels": { - "items": { - "type": "string" - }, - "title": "Dataset Labels", - "type": "array" - } - }, - "required": [ - "label", - "node_files", - "data_dir", - "dataset_labels" - ], - "title": "GoldStandard", - "type": "object" - }, - "Locations": { - "additionalProperties": false, - "properties": { - "reconstruction_dir": { - "title": "Reconstruction Dir", - "type": "string" - } - }, - "required": [ - "reconstruction_dir" - ], - "title": "Locations", - "type": "object" - }, - "MlAnalysis": { - "additionalProperties": false, - "properties": { - "include": { - "title": "Include", - "type": "boolean" - }, - "aggregate_per_algorithm": { - "default": false, - "title": "Aggregate Per Algorithm", - "type": "boolean" - }, - "components": { - "default": 2, - "title": "Components", - "type": "integer" - }, - "labels": { - "default": true, - "title": "Labels", - "type": "boolean" - }, - "kde": { - "default": false, - "title": "Kde", - "type": "boolean" - }, - "remove_empty_pathways": { - "default": false, - "title": "Remove Empty Pathways", - "type": "boolean" - }, - "linkage": { - "$ref": "#/$defs/MlLinkage", - "default": "ward" - }, - "metric": { - "$ref": "#/$defs/MlMetric", - "default": "euclidean" - } - }, - "required": [ - "include" - ], - "title": "MlAnalysis", - "type": "object" - }, - "MlLinkage": { - "enum": [ - "ward", - "complete", - "average", - "single" - ], - "title": "MlLinkage", - "type": "string" - }, - "MlMetric": { - "enum": [ - "euclidean", - "manhattan", - "cosine" - ], - "title": "MlMetric", - "type": "string" - }, - "ReconstructionSettings": { - "additionalProperties": false, - "properties": { - "locations": { - "$ref": "#/$defs/Locations" - } - }, - "required": [ - "locations" - ], - 
"title": "ReconstructionSettings", - "type": "object" - }, - "SummaryAnalysis": { - "additionalProperties": false, - "properties": { - "include": { - "title": "Include", - "type": "boolean" - } - }, - "required": [ - "include" - ], - "title": "SummaryAnalysis", - "type": "object" - }, - "allpairsModel": { - "additionalProperties": false, - "properties": { - "name": { - "const": "allpairs", - "title": "Name", - "type": "string" - }, - "include": { - "title": "Include", - "type": "boolean" - }, - "runs": { - "additionalProperties": { - "$ref": "#/$defs/allpairsRunModel" - }, - "default": { - "default": {} - }, - "title": "Runs", - "type": "object" - } - }, - "required": [ - "name", - "include" - ], - "title": "allpairsModel", - "type": "object" - }, - "allpairsRunModel": { - "additionalProperties": false, - "properties": {}, - "title": "allpairsRunModel", - "type": "object" - }, - "bowtiebuilderModel": { - "additionalProperties": false, - "properties": { - "name": { - "const": "bowtiebuilder", - "title": "Name", - "type": "string" - }, - "include": { - "title": "Include", - "type": "boolean" - }, - "runs": { - "additionalProperties": { - "$ref": "#/$defs/bowtiebuilderRunModel" - }, - "default": { - "default": {} - }, - "title": "Runs", - "type": "object" - } - }, - "required": [ - "name", - "include" - ], - "title": "bowtiebuilderModel", - "type": "object" - }, - "bowtiebuilderRunModel": { - "additionalProperties": false, - "properties": {}, - "title": "bowtiebuilderRunModel", - "type": "object" - }, - "dominoModel": { - "additionalProperties": false, - "properties": { - "name": { - "const": "domino", - "title": "Name", - "type": "string" - }, - "include": { - "title": "Include", - "type": "boolean" - }, - "runs": { - "additionalProperties": { - "$ref": "#/$defs/dominoRunModel" - }, - "default": { - "default": { - "module_threshold": null, - "slice_threshold": null - } - }, - "title": "Runs", - "type": "object" - } - }, - "required": [ - "name", - "include" - ], - "title": "dominoModel", - "type": "object" - }, - "dominoRunModel": { - "additionalProperties": false, - "properties": { - "module_threshold": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "the p-value threshold for considering a slice as relevant (optional)", - "title": "Module Threshold" - }, - "slice_threshold": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "the p-value threshold for considering a putative module as final module (optional)", - "title": "Slice Threshold" - } - }, - "title": "dominoRunModel", - "type": "object" - }, - "meoModel": { - "additionalProperties": false, - "properties": { - "name": { - "const": "meo", - "title": "Name", - "type": "string" - }, - "include": { - "title": "Include", - "type": "boolean" - }, - "runs": { - "additionalProperties": { - "$ref": "#/$defs/meoRunModel" - }, - "default": { - "default": { - "max_path_length": null, - "local_search": null, - "rand_restarts": null - } - }, - "title": "Runs", - "type": "object" - } - }, - "required": [ - "name", - "include" - ], - "title": "meoModel", - "type": "object" - }, - "meoRunModel": { - "additionalProperties": false, - "properties": { - 
"max_path_length": { - "anyOf": [ - { - "type": "integer" - }, - { - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "the maximal length of a path from sources and targets to orient.", - "title": "Max Path Length" - }, - "local_search": { - "anyOf": [ - { - "type": "boolean" - }, - { - "items": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": null, - "description": "a boolean parameter that enables MEO's local search functionality.\nSee \"Improving approximations with local search\" in the associated paper\nfor more information.", - "title": "Local Search" - }, - "rand_restarts": { - "anyOf": [ - { - "type": "integer" - }, - { - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "The number of random restarts to do.", - "title": "Rand Restarts" - } - }, - "title": "meoRunModel", - "type": "object" - }, - "mincostflowModel": { - "additionalProperties": false, - "properties": { - "name": { - "const": "mincostflow", - "title": "Name", - "type": "string" - }, - "include": { - "title": "Include", - "type": "boolean" - }, - "runs": { - "additionalProperties": { - "$ref": "#/$defs/mincostflowRunModel" - }, - "default": { - "default": { - "flow": null, - "capacity": null - } - }, - "title": "Runs", - "type": "object" - } - }, - "required": [ - "name", - "include" - ], - "title": "mincostflowModel", - "type": "object" - }, - "mincostflowRunModel": { - "additionalProperties": false, - "properties": { - "flow": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "amount of flow going through the graph", - "title": "Flow" - }, - "capacity": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "amount of capacity allowed on each edge", - "title": "Capacity" - } - }, - "title": "mincostflowRunModel", - "type": "object" - }, - "omicsintegrator1Model": { - "additionalProperties": false, - "properties": { - "name": { - "const": "omicsintegrator1", - "title": "Name", - "type": "string" - }, - "include": { - "title": "Include", - "type": "boolean" - }, - "runs": { - "additionalProperties": { - "$ref": "#/$defs/omicsintegrator1RunModel" - }, - "title": "Runs", - "type": "object" - } - }, - "required": [ - "name", - "include", - "runs" - ], - "title": "omicsintegrator1Model", - "type": "object" - }, - "omicsintegrator1RunModel": { - "additionalProperties": false, - "properties": { - "dummy_mode": { - "anyOf": [ - { - "type": "string" - }, - { - "items": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Dummy Mode" - }, - "mu_squared": { - "anyOf": [ - { - "type": "boolean" - }, - { - "items": { - "type": "boolean" - }, - "type": "array" - } - ], - "default": false, - "title": "Mu Squared" - }, - "exclude_terms": { 
- "anyOf": [ - { - "type": "boolean" - }, - { - "items": { - "type": "boolean" - }, - "type": "array" - } - ], - "default": false, - "title": "Exclude Terms" - }, - "noisy_edges": { - "anyOf": [ - { - "type": "integer" - }, - { - "items": { - "type": "integer" - }, - "type": "array" - }, - { - "type": "string" - } - ], - "default": 0, - "description": "How many times you would like to add noise to the given edge values and re-run the algorithm.", - "title": "Noisy Edges" - }, - "shuffled_prizes": { - "anyOf": [ - { - "type": "integer" - }, - { - "items": { - "type": "integer" - }, - "type": "array" - }, - { - "type": "string" - } - ], - "default": 0, - "description": "How many times the algorithm should shuffle the prizes and re-run", - "title": "Shuffled Prizes" - }, - "random_terminals": { - "anyOf": [ - { - "type": "integer" - }, - { - "items": { - "type": "integer" - }, - "type": "array" - }, - { - "type": "string" - } - ], - "default": 0, - "description": "How many times to apply the given prizes to random nodes in the interactome", - "title": "Random Terminals" - }, - "seed": { - "anyOf": [ - { - "type": "integer" - }, - { - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "The randomness seed to use.", - "title": "Seed" - }, - "w": { - "anyOf": [ - { - "type": "integer" - }, - { - "items": { - "type": "integer" - }, - "type": "array" - }, - { - "type": "string" - } - ], - "description": "Float that affects the number of connected components, with higher values leading to more components", - "title": "W" - }, - "b": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "type": "number" - }, - "type": "array" - }, - { - "type": "string" - } - ], - "description": "The trade-off between including more prizes and using less reliable edgess", - "title": "B" - }, - "d": { - "anyOf": [ - { - "type": "integer" - }, - { - "items": { - "type": "integer" - }, - "type": "array" - }, - { - "type": "string" - } - ], - "description": "Controls the maximum path-length from root to terminal nodes", - "title": "D" - }, - "mu": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "type": "number" - }, - "type": "array" - }, - { - "type": "string" - } - ], - "default": 0.0, - "description": "Controls the degree-based negative prizes (defualt 0.0)", - "title": "Mu" - }, - "noise": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations", - "title": "Noise" - }, - "g": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "(Gamma) multiplicative edge penalty from degree of endpoints", - "title": "G" - }, - "r": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "msgsteiner parameter that adds random noise to edges, which is rarely needed because the --noisyEdges 
option is recommended instead (default 0)", - "title": "R" - } - }, - "required": [ - "w", - "b", - "d" - ], - "title": "omicsintegrator1RunModel", - "type": "object" - }, - "omicsintegrator2Model": { - "additionalProperties": false, - "properties": { - "name": { - "const": "omicsintegrator2", - "title": "Name", - "type": "string" - }, - "include": { - "title": "Include", - "type": "boolean" - }, - "runs": { - "additionalProperties": { - "$ref": "#/$defs/omicsintegrator2RunModel" - }, - "default": { - "default": { - "w": 6.0, - "b": 1.0, - "g": 20.0, - "noise": null, - "noisy_edges": null, - "random_terminals": null, - "dummy_mode": null, - "seed": null - } - }, - "title": "Runs", - "type": "object" - } - }, - "required": [ - "name", - "include" - ], - "title": "omicsintegrator2Model", - "type": "object" - }, - "omicsintegrator2RunModel": { - "additionalProperties": false, - "properties": { - "w": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "type": "number" - }, - "type": "array" - }, - { - "type": "string" - } - ], - "default": 6, - "description": "Omega: the weight of the edges connecting the dummy node to the nodes selected by dummyMode", - "title": "W" - }, - "b": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "type": "number" - }, - "type": "array" - }, - { - "type": "string" - } - ], - "default": 1, - "description": "Beta: scaling factor of prizes", - "title": "B" - }, - "g": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "type": "number" - }, - "type": "array" - }, - { - "type": "string" - } - ], - "default": 20, - "description": "Gamma: multiplicative edge penalty from degree of endpoints", - "title": "G" - }, - "noise": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations.", - "title": "Noise" - }, - "noisy_edges": { - "anyOf": [ - { - "type": "integer" - }, - { - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "An integer specifying how many times to add noise to the given edge values and re-run.", - "title": "Noisy Edges" - }, - "random_terminals": { - "anyOf": [ - { - "type": "integer" - }, - { - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "An integer specifying how many times to apply your given prizes to random nodes in the interactome and re-run", - "title": "Random Terminals" - }, - "dummy_mode": { - "anyOf": [ - { - "$ref": "#/$defs/DummyMode" - }, - { - "items": { - "anyOf": [ - { - "$ref": "#/$defs/DummyMode" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Tells the program which nodes in the interactome to connect the dummy node to. 
(default: terminals)\n \"terminals\" = connect to all terminals\n \"others\" = connect to all nodes except for terminals\n \"all\" = connect to all nodes in the interactome.", - "title": "Dummy Mode" - }, - "seed": { - "anyOf": [ - { - "type": "integer" - }, - { - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "The random seed to use for this run.", - "title": "Seed" - } - }, - "title": "omicsintegrator2RunModel", - "type": "object" - }, - "pathlinkerModel": { - "additionalProperties": false, - "properties": { - "name": { - "const": "pathlinker", - "title": "Name", - "type": "string" - }, - "include": { - "title": "Include", - "type": "boolean" - }, - "runs": { - "additionalProperties": { - "$ref": "#/$defs/pathlinkerRunModel" - }, - "default": { - "default": { - "k": 100 - } - }, - "title": "Runs", - "type": "object" - } - }, - "required": [ - "name", - "include" - ], - "title": "pathlinkerModel", - "type": "object" - }, - "pathlinkerRunModel": { - "additionalProperties": false, - "properties": { - "k": { - "anyOf": [ - { - "type": "integer" - }, - { - "items": { - "type": "integer" - }, - "type": "array" - }, - { - "type": "string" - } - ], - "default": 100, - "description": "path length", - "title": "K" - } - }, - "title": "pathlinkerRunModel", - "type": "object" - }, - "responsenetModel": { - "additionalProperties": false, - "properties": { - "name": { - "const": "responsenet", - "title": "Name", - "type": "string" - }, - "include": { - "title": "Include", - "type": "boolean" - }, - "runs": { - "additionalProperties": { - "$ref": "#/$defs/responsenetRunModel" - }, - "default": { - "default": { - "gamma": 10 - } - }, - "title": "Runs", - "type": "object" - } - }, - "required": [ - "name", - "include" - ], - "title": "responsenetModel", - "type": "object" - }, - "responsenetRunModel": { - "additionalProperties": false, - "properties": { - "gamma": { - "anyOf": [ - { - "type": "integer" - }, - { - "items": { - "type": "integer" - }, - "type": "array" - }, - { - "type": "string" - } - ], - "default": 10, - "description": "The 'size' of the graph. 
The higher gamma is, the more flow\nis encouraged to start from the source nodes.", - "title": "Gamma" - } - }, - "title": "responsenetRunModel", - "type": "object" - }, - "rwrModel": { - "additionalProperties": false, - "properties": { - "name": { - "const": "rwr", - "title": "Name", - "type": "string" - }, - "include": { - "title": "Include", - "type": "boolean" - }, - "runs": { - "additionalProperties": { - "$ref": "#/$defs/rwrRunModel" - }, - "title": "Runs", - "type": "object" - } - }, - "required": [ - "name", - "include", - "runs" - ], - "title": "rwrModel", - "type": "object" - }, - "rwrRunModel": { - "additionalProperties": false, - "properties": { - "threshold": { - "anyOf": [ - { - "type": "integer" - }, - { - "items": { - "type": "integer" - }, - "type": "array" - }, - { - "type": "string" - } - ], - "description": "The number of nodes to return", - "title": "Threshold" - }, - "alpha": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "The chance of a restart during the random walk", - "title": "Alpha" - } - }, - "required": [ - "threshold" - ], - "title": "rwrRunModel", - "type": "object" - }, - "strwrModel": { - "additionalProperties": false, - "properties": { - "name": { - "const": "strwr", - "title": "Name", - "type": "string" - }, - "include": { - "title": "Include", - "type": "boolean" - }, - "runs": { - "additionalProperties": { - "$ref": "#/$defs/strwrRunModel" - }, - "title": "Runs", - "type": "object" - } - }, - "required": [ - "name", - "include", - "runs" - ], - "title": "strwrModel", - "type": "object" - }, - "strwrRunModel": { - "additionalProperties": false, - "properties": { - "threshold": { - "anyOf": [ - { - "type": "integer" - }, - { - "items": { - "type": "integer" - }, - "type": "array" - }, - { - "type": "string" - } - ], - "description": "The number of nodes to return", - "title": "Threshold" - }, - "alpha": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "The chance of a restart during the random walk", - "title": "Alpha" - } - }, - "required": [ - "threshold" - ], - "title": "strwrRunModel", - "type": "object" - } - }, - "additionalProperties": false, - "properties": { - "containers": { - "$ref": "#/$defs/ContainerSettings" - }, - "hash_length": { - "default": 7, - "description": "The length of the hash used to identify a parameter combination", - "title": "Hash Length", - "type": "integer" - }, - "algorithms": { - "items": { - "discriminator": { - "mapping": { - "allpairs": "#/$defs/allpairsModel", - "bowtiebuilder": "#/$defs/bowtiebuilderModel", - "domino": "#/$defs/dominoModel", - "meo": "#/$defs/meoModel", - "mincostflow": "#/$defs/mincostflowModel", - "omicsintegrator1": "#/$defs/omicsintegrator1Model", - "omicsintegrator2": "#/$defs/omicsintegrator2Model", - "pathlinker": "#/$defs/pathlinkerModel", - "responsenet": "#/$defs/responsenetModel", - "rwr": "#/$defs/rwrModel", - "strwr": "#/$defs/strwrModel" - }, - "propertyName": "name" - }, - "oneOf": [ - { - "$ref": "#/$defs/allpairsModel" - }, - { - "$ref": "#/$defs/bowtiebuilderModel" - }, - { - "$ref": "#/$defs/dominoModel" - }, - { - "$ref": "#/$defs/meoModel" - }, - { - "$ref": 
"#/$defs/mincostflowModel" - }, - { - "$ref": "#/$defs/omicsintegrator1Model" - }, - { - "$ref": "#/$defs/omicsintegrator2Model" - }, - { - "$ref": "#/$defs/pathlinkerModel" - }, - { - "$ref": "#/$defs/responsenetModel" - }, - { - "$ref": "#/$defs/rwrModel" - }, - { - "$ref": "#/$defs/strwrModel" - } - ] - }, - "title": "Algorithms", - "type": "array" - }, - "datasets": { - "items": { - "$ref": "#/$defs/Dataset" - }, - "title": "Datasets", - "type": "array" - }, - "gold_standards": { - "default": [], - "items": { - "$ref": "#/$defs/GoldStandard" - }, - "title": "Gold Standards", - "type": "array" - }, - "analysis": { - "$ref": "#/$defs/Analysis", - "default": { - "summary": { - "include": false - }, - "cytoscape": { - "include": false - }, - "ml": { - "aggregate_per_algorithm": false, - "components": 2, - "include": false, - "kde": false, - "labels": true, - "linkage": "ward", - "metric": "euclidean", - "remove_empty_pathways": false - }, - "evaluation": { - "aggregate_per_algorithm": false, - "include": false - } - } - }, - "reconstruction_settings": { - "$ref": "#/$defs/ReconstructionSettings" - } - }, - "required": [ - "containers", - "algorithms", - "datasets", - "reconstruction_settings" - ], - "title": "RawConfig", - "type": "object" -} \ No newline at end of file diff --git a/test/analysis/input/config.yaml b/test/analysis/input/config.yaml index 90c67a43e..392769d4c 100644 --- a/test/analysis/input/config.yaml +++ b/test/analysis/input/config.yaml @@ -1,4 +1,3 @@ -# yaml-language-server: $schema=./schema.json hash_length: 7 containers: diff --git a/util/update_schema.py b/util/update_schema.py deleted file mode 100644 index c6a7bedca..000000000 --- a/util/update_schema.py +++ /dev/null @@ -1,13 +0,0 @@ -""" -Updates config/schema.json. -This should be done whenever a new algorithm is introduced, -or the config is otherwise directly changed. -""" - -import json -from pathlib import Path - -from spras.config.schema import RawConfig - -config_schema = RawConfig.model_json_schema() -Path('config/schema.json').write_text(json.dumps(config_schema, indent=2)) From a77263f481e08f387566a21d5cca62a402410b18 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sat, 6 Sep 2025 06:14:20 +0000 Subject: [PATCH 35/68] test: fix --- test/analysis/input/egfr.yaml | 37 ++++++++++++++++++----------------- test/test_util.py | 2 +- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/test/analysis/input/egfr.yaml b/test/analysis/input/egfr.yaml index da4560df9..958fcc7fd 100644 --- a/test/analysis/input/egfr.yaml +++ b/test/analysis/input/egfr.yaml @@ -1,26 +1,27 @@ # The length of the hash used to identify a parameter combination hash_length: 7 -# Specify the container framework. Current supported versions include 'docker' and -# 'singularity'. If container_framework is not specified, SPRAS will default to docker. -container_framework: docker +containers: + # Specify the container framework. Current supported versions include 'docker' and + # 'singularity'. If container_framework is not specified, SPRAS will default to docker. + framework: docker -# Only used if container_framework is set to singularity, this will unpack the singularity containers -# to the local filesystem. This is useful when PRM containers need to run inside another container, -# such as would be the case in an HTCondor/OSPool environment. -# NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way -# that persists after the workflow is complete. 
To clean up the unpacked containers, the user must -# manually delete them. -unpack_singularity: false + # Only used if container_framework is set to singularity, this will unpack the singularity containers + # to the local filesystem. This is useful when PRM containers need to run inside another container, + # such as would be the case in an HTCondor/OSPool environment. + # NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way + # that persists after the workflow is complete. To clean up the unpacked containers, the user must + # manually delete them. + unpack_singularity: false -# Allow the user to configure which container registry containers should be pulled from -# Note that this assumes container names are consistent across registries, and that the -# registry being passed doesn't require authentication for pull actions -container_registry: - base_url: docker.io - # The owner or project of the registry - # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs - owner: reedcompbio + # Allow the user to configure which container registry containers should be pulled from + # Note that this assumes container names are consistent across registries, and that the + # registry being passed doesn't require authentication for pull actions + registry: + base_url: docker.io + # The owner or project of the registry + # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs + owner: reedcompbio algorithms: - name: pathlinker diff --git a/test/test_util.py b/test/test_util.py index 2a25fc0d1..c18a35f75 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -41,7 +41,7 @@ def test_hash_params_sha1_base32(self): ('test/OmicsIntegrator1/output', PurePosixPath('/spras'), '/spras/TNDO5TR/output'), ('../src', '/spras', '/spras/NNBVZ6X/src')]) def test_prepare_volume(self, filename, volume_base, expected_filename): - _, container_filename = prepare_volume(filename, volume_base) + _, container_filename = prepare_volume(filename, volume_base, config.config.container_settings) assert container_filename == expected_filename def test_convert_docker_path(self): From 05e40c43f924c3e85625cf2062e20aef384236d6 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 28 Oct 2025 09:59:43 -0700 Subject: [PATCH 36/68] Update spras/config/algorithms.py Co-authored-by: Neha Talluri <78840540+ntalluri@users.noreply.github.com> --- spras/config/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index 889efab35..2a7620ffd 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -1,5 +1,5 @@ """ -Dynamic construction of algoithm parameters with runtime type information for +Dynamic construction of algorithm parameters with runtime type information for parameter combinations. This has been isolated from schema.py as it is not declarative, and rather mainly contains validators and lower-level pydantic code. """ From cea83b2a6feb97b4b74516644a74ff155073ec74 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Wed, 29 Oct 2025 06:02:42 +0000
Subject: [PATCH 37/68] test(btb): correct tests

---
 test/BowTieBuilder/test_btb.py | 10 +++++-----
 test/DOMINO/test_domino.py     |  6 ++----
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/test/BowTieBuilder/test_btb.py b/test/BowTieBuilder/test_btb.py
index f9c15f03c..0f87e286a 100644
--- a/test/BowTieBuilder/test_btb.py
+++ b/test/BowTieBuilder/test_btb.py
@@ -304,8 +304,8 @@ def test_weight_one(self):
     @pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system')
     def test_btb_singularity(self):
         OUT_FILE_DEFAULT.unlink(missing_ok=True)
-        BTB.run(edges=Path(TEST_DIR, 'input', 'source-to-source-edges.txt'),
-                sources=Path(TEST_DIR, 'input', 'btb-sources.txt'),
-                targets=Path(TEST_DIR, 'input', 'btb-targets.txt'),
-                output_file=OUT_FILE_DEFAULT,
-                container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity))
+        BTB.run({"edges": Path(TEST_DIR, 'input', 'source-to-source-edges.txt'),
+                 "sources": Path(TEST_DIR, 'input', 'btb-sources.txt'),
+                 "targets": Path(TEST_DIR, 'input', 'btb-targets.txt')},
+                output_file=OUT_FILE_DEFAULT,
+                container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity))
diff --git a/test/DOMINO/test_domino.py b/test/DOMINO/test_domino.py
index e757d0029..4bf102f2f 100644
--- a/test/DOMINO/test_domino.py
+++ b/test/DOMINO/test_domino.py
@@ -31,8 +31,7 @@ class TestDOMINO:
 
     def test_domino_required(self):
         # Only include required arguments
-        out_path = Path(OUT_FILE_DEFAULT)
-        out_path.unlink(missing_ok=True)
+        OUT_FILE_DEFAULT.unlink(missing_ok=True)
         DOMINO.run({"network": TEST_DIR / 'input' / 'simple' / 'domino-network.txt',
                     "active_genes": TEST_DIR / 'input' / 'simple' / 'domino-active-genes.txt'},
                    output_file=OUT_FILE_DEFAULT)
@@ -42,8 +41,7 @@ def test_domino_required(self):
 
     def test_domino_optional(self):
         # Include optional arguments
-        out_path = Path(OUT_FILE_OPTIONAL)
-        out_path.unlink(missing_ok=True)
+        OUT_FILE_OPTIONAL.unlink(missing_ok=True)
         DOMINO.run({"network": TEST_DIR / 'input' / 'simple' / 'domino-network.txt',
                     "active_genes": TEST_DIR / 'input' / 'simple' / 'domino-active-genes.txt'},
                    output_file=OUT_FILE_OPTIONAL,

From d9f251dcee3baf3ce78fc152cf4e623921f403d9 Mon Sep 17 00:00:00 2001
From: "Tristan F."
Date: Wed, 29 Oct 2025 06:05:06 +0000 Subject: [PATCH 38/68] style: fmt --- spras/config/config.py | 2 +- spras/prm.py | 12 +++++++----- spras/responsenet.py | 1 - spras/rwr.py | 1 - test/DOMINO/test_domino.py | 2 +- test/MEO/test_meo.py | 2 +- test/MinCostFlow/test_mcf.py | 2 +- test/OmicsIntegrator1/test_oi1.py | 2 +- test/OmicsIntegrator2/test_oi2.py | 2 +- test/PathLinker/test_pathlinker.py | 2 +- test/RWR/test_RWR.py | 2 +- test/ST_RWR/test_STRWR.py | 2 +- 12 files changed, 16 insertions(+), 16 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index a920694ec..be206680a 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -15,8 +15,8 @@ import copy as copy import itertools as it import os -from typing import Any import warnings +from typing import Any import numpy as np import yaml diff --git a/spras/prm.py b/spras/prm.py index 6ee65cf0c..6bf930ec2 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -1,7 +1,8 @@ import os from abc import ABC, abstractmethod -from typing import Any, Generic, TypeVar, cast, get_args from pathlib import Path +from typing import Any, Generic, Optional, TypeVar, cast, get_args + from pydantic import BaseModel from spras.config.container_schema import ProcessedContainerSettings @@ -82,15 +83,16 @@ def validate_required_inputs(cls, filename_map: dict[str, str]): for input_type in cls.required_inputs: if input_type not in filename_map: raise ValueError("{input_type} filename is missing") - + @classmethod - def validate_required_run_args(cls, inputs: dict[str, str | os.PathLike], relax: list[str] = []): + def validate_required_run_args(cls, inputs: dict[str, str | os.PathLike], relax: Optional[list[str]] = None): """ Validates the `inputs` parameter for `PRM#run`. @param inputs: See `PRM#run`. 
@param relax: List of inputs that aren't required: if they are specified, they should be valid path """ + if not relax: relax = [] # Check that `relax` is a valid list for entry in relax: @@ -104,11 +106,11 @@ def validate_required_run_args(cls, inputs: dict[str, str | os.PathLike], relax: if input_type in relax: continue raise ValueError(f'Required input "{input_type}" is not set') - + path = Path(inputs[input_type]) if not path.exists(): raise OSError(f'Required input "{input_type}" is pointing to a missing file "{path}".') - + # Then, check that all inputs are required inputs (to prevent typos / catch errors when inputs are updated) for input_type in inputs.keys(): if input_type not in cls.required_inputs: diff --git a/spras/responsenet.py b/spras/responsenet.py index 328e83ebb..d84af9bf6 100644 --- a/spras/responsenet.py +++ b/spras/responsenet.py @@ -2,7 +2,6 @@ from pydantic import BaseModel, ConfigDict -from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( convert_undirected_to_directed, diff --git a/spras/rwr.py b/spras/rwr.py index 79882d5e4..cfa30b73d 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -5,7 +5,6 @@ from pydantic import BaseModel, ConfigDict from spras.config.container_schema import ProcessedContainerSettings -from spras.containers import prepare_volume, run_container from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import ( diff --git a/test/DOMINO/test_domino.py b/test/DOMINO/test_domino.py index 4bf102f2f..e596d4315 100644 --- a/test/DOMINO/test_domino.py +++ b/test/DOMINO/test_domino.py @@ -4,13 +4,13 @@ import pytest import spras.config.config as config +from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings from spras.domino import ( DOMINO, DominoParams, post_domino_id_transform, pre_domino_id_transform, ) -from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings config.init_from_file("config/config.yaml") diff --git a/test/MEO/test_meo.py b/test/MEO/test_meo.py index b86d19d0c..7322bf003 100644 --- a/test/MEO/test_meo.py +++ b/test/MEO/test_meo.py @@ -4,8 +4,8 @@ import pytest import spras.config.config as config -from spras.meo import MEO, MEOParams, write_properties from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings +from spras.meo import MEO, MEOParams, write_properties config.init_from_file("config/config.yaml") diff --git a/test/MinCostFlow/test_mcf.py b/test/MinCostFlow/test_mcf.py index d118ab379..23e597175 100644 --- a/test/MinCostFlow/test_mcf.py +++ b/test/MinCostFlow/test_mcf.py @@ -4,8 +4,8 @@ import pytest import spras.config.config as config -from spras.mincostflow import MinCostFlow, MinCostFlowParams from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings +from spras.mincostflow import MinCostFlow, MinCostFlowParams config.init_from_file("config/config.yaml") diff --git a/test/OmicsIntegrator1/test_oi1.py b/test/OmicsIntegrator1/test_oi1.py index 5e7be6421..49e50148f 100644 --- a/test/OmicsIntegrator1/test_oi1.py +++ b/test/OmicsIntegrator1/test_oi1.py @@ -4,8 +4,8 @@ import pytest import spras.config.config as config -from spras.omicsintegrator1 import OmicsIntegrator1, OmicsIntegrator1Params, write_conf from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings +from spras.omicsintegrator1 
import OmicsIntegrator1, OmicsIntegrator1Params, write_conf config.init_from_file("config/config.yaml") diff --git a/test/OmicsIntegrator2/test_oi2.py b/test/OmicsIntegrator2/test_oi2.py index b6bf6adc4..3638587e8 100644 --- a/test/OmicsIntegrator2/test_oi2.py +++ b/test/OmicsIntegrator2/test_oi2.py @@ -4,8 +4,8 @@ import pytest import spras.config.config as config -from spras.omicsintegrator2 import OmicsIntegrator2, OmicsIntegrator2Params, DummyMode from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings +from spras.omicsintegrator2 import DummyMode, OmicsIntegrator2, OmicsIntegrator2Params config.init_from_file("config/config.yaml") diff --git a/test/PathLinker/test_pathlinker.py b/test/PathLinker/test_pathlinker.py index ddc004046..e2d3e6fae 100644 --- a/test/PathLinker/test_pathlinker.py +++ b/test/PathLinker/test_pathlinker.py @@ -4,8 +4,8 @@ import pytest import spras.config.config as config -from spras.pathlinker import PathLinker, PathLinkerParams from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings +from spras.pathlinker import PathLinker, PathLinkerParams config.init_from_file("config/config.yaml") diff --git a/test/RWR/test_RWR.py b/test/RWR/test_RWR.py index faf11272c..fa3ca96f1 100644 --- a/test/RWR/test_RWR.py +++ b/test/RWR/test_RWR.py @@ -5,8 +5,8 @@ import pytest import spras.config.config as config -from spras.rwr import RWR, RWRParams from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings +from spras.rwr import RWR, RWRParams config.init_from_file("config/config.yaml") diff --git a/test/ST_RWR/test_STRWR.py b/test/ST_RWR/test_STRWR.py index 29e3090c0..be5a7c20c 100644 --- a/test/ST_RWR/test_STRWR.py +++ b/test/ST_RWR/test_STRWR.py @@ -5,8 +5,8 @@ import pytest import spras.config.config as config -from spras.strwr import ST_RWR, ST_RWRParams from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings +from spras.strwr import ST_RWR, ST_RWRParams config.init_from_file("config/config.yaml") From dbe69715bd813e86c2ad4ca62ac2b01a53cadcfd Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Wed, 29 Oct 2025 06:11:30 +0000 Subject: [PATCH 39/68] test: correct other mishaps --- spras/responsenet.py | 5 ++++- test/BowTieBuilder/test_btb.py | 6 +++--- test/ResponseNet/test_rn.py | 26 +++++++++++++------------- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/spras/responsenet.py b/spras/responsenet.py index d84af9bf6..92e85245b 100644 --- a/spras/responsenet.py +++ b/spras/responsenet.py @@ -2,6 +2,7 @@ from pydantic import BaseModel, ConfigDict +from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( convert_undirected_to_directed, @@ -64,8 +65,10 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(inputs, output_file, args, container_settings): + def run(inputs, output_file, args=None, container_settings=None): + if not container_settings: container_settings = ProcessedContainerSettings() ResponseNet.validate_required_run_args(inputs) + if not args: args = ResponseNetParams() # the data files will be mapped within this directory within the container work_dir = '/ResponseNet' diff --git a/test/BowTieBuilder/test_btb.py b/test/BowTieBuilder/test_btb.py index 0f87e286a..1d3fee8e1 100644 --- a/test/BowTieBuilder/test_btb.py +++ b/test/BowTieBuilder/test_btb.py @@ -18,17 +18,17 @@ class TestBowTieBuilder: Run the BowTieBuilder algorithm with missing arguments """ def test_btb_missing(self): - with pytest.raises(OSError): + with pytest.raises(ValueError): # No edges BTB.run({"targets": Path(TEST_DIR, 'input', 'target.txt'), "sources": Path(TEST_DIR, 'input', 'source.txt')}, output_file=OUT_FILE_DEFAULT) - with pytest.raises(OSError): + with pytest.raises(ValueError): # No source BTB.run({"targets": Path(TEST_DIR, 'input', 'target.txt'), "edges": Path(TEST_DIR, 'input', 'edges.txt')}, output_file=OUT_FILE_DEFAULT) - with pytest.raises(OSError): + with pytest.raises(ValueError): # No target BTB.run({"sources": Path(TEST_DIR, 'input', 'source.txt'), "edges": Path(TEST_DIR, 'input', 'edges.txt')}, diff --git a/test/ResponseNet/test_rn.py b/test/ResponseNet/test_rn.py index 2a3abccca..9d3c1cbaa 100644 --- a/test/ResponseNet/test_rn.py +++ b/test/ResponseNet/test_rn.py @@ -6,7 +6,7 @@ import spras.config.config as config from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings -from spras.responsenet import ResponseNet +from spras.responsenet import ResponseNet, ResponseNetParams config.init_from_file("config/config.yaml") @@ -21,9 +21,9 @@ class TestResponseNet: def test_responsenet_required(self): OUT_FILE.unlink(missing_ok=True) - ResponseNet.run(sources=TEST_DIR / 'input' / 'rn-sources.txt', - targets=TEST_DIR / 'input' / 'rn-targets.txt', - edges=TEST_DIR / 'input' / 'rn-edges.txt', + ResponseNet.run({"sources": TEST_DIR / 'input' / 'rn-sources.txt', + "targets": TEST_DIR / 'input' / 'rn-targets.txt', + "edges": TEST_DIR / 'input' / 'rn-edges.txt'}, output_file=OUT_FILE) assert OUT_FILE.exists() @@ -32,11 +32,11 @@ def test_responsenet_required(self): def test_responsenet_all_optional(self): OUT_FILE.unlink(missing_ok=True) # Include all optional arguments - ResponseNet.run(sources=TEST_DIR / 'input' / 'rn-sources.txt', - targets=TEST_DIR / 'input' / 'rn-targets.txt', - edges=TEST_DIR / 'input' / 'rn-edges.txt', + ResponseNet.run({"sources": TEST_DIR / 'input' / 'rn-sources.txt', + "targets": TEST_DIR / 'input' / 'rn-targets.txt', + "edges": TEST_DIR / 'input' / 'rn-edges.txt'}, 
output_file=OUT_FILE, - gamma=1) + args=ResponseNetParams(gamma=1)) assert OUT_FILE.exists() assert filecmp.cmp(OUT_FILE, EXPECTED_FILE_OPTIONAL, shallow=True) @@ -44,8 +44,8 @@ def test_responsenet_all_optional(self): def test_mincostflow_missing(self): # Test the expected error is raised when required arguments are missing with pytest.raises(ValueError): - ResponseNet.run(sources=TEST_DIR / 'input' / 'rn-sources.txt', - targets=TEST_DIR / 'input' / 'rn-targets.txt', + ResponseNet.run({"sources": TEST_DIR / 'input' / 'rn-sources.txt', + "targets": TEST_DIR / 'input' / 'rn-targets.txt'}, output_file=OUT_FILE) # Only run Singularity test if the binary is available on the system @@ -54,9 +54,9 @@ def test_mincostflow_missing(self): def test_responsenet_singularity(self): OUT_FILE.unlink(missing_ok=True) - ResponseNet.run(sources=TEST_DIR / 'input' / 'rn-sources.txt', - targets=TEST_DIR / 'input' / 'rn-targets.txt', - edges=TEST_DIR / 'input' / 'rn-edges.txt', + ResponseNet.run({"sources": TEST_DIR / 'input' / 'rn-sources.txt', + "targets": TEST_DIR / 'input' / 'rn-targets.txt', + "edges": TEST_DIR / 'input' / 'rn-edges.txt'}, output_file=OUT_FILE, container_settings=ProcessedContainerSettings(framework=ContainerFramework.singularity)) assert OUT_FILE.exists() From 647907a8a482e278cdfa76b9825d422dab03910b Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Tue, 28 Oct 2025 23:19:58 -0700 Subject: [PATCH 40/68] fix(mcf): correct types --- config/config.yaml | 2 +- spras/mincostflow.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index fcf27b982..f2899fb9a 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -97,7 +97,7 @@ algorithms: include: true runs: run1: - flow: 1 # The flow must be an int + flow: 1 capacity: 1 - name: "allpairs" diff --git a/spras/mincostflow.py b/spras/mincostflow.py index fc8582c97..fa65559d1 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -15,10 +15,10 @@ __all__ = ['MinCostFlow', 'MinCostFlowParams'] class MinCostFlowParams(BaseModel): - flow: Optional[float] = None + flow: Optional[int] = None "amount of flow going through the graph" - capacity: Optional[float] = None + capacity: Optional[int] = None "amount of capacity allowed on each edge" model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) From 25a079653daad4a6a93aa8d3d94efe29c9462f0d Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Wed, 29 Oct 2025 06:23:27 +0000 Subject: [PATCH 41/68] fix(oi1): correct dummy node handling --- spras/omicsintegrator1.py | 2 +- test/BowTieBuilder/test_btb.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 9f4e3a37d..be2ea1a18 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -164,7 +164,7 @@ def run(inputs, output_file, args, container_settings=None): # add dummy node file to the volume if dummy_mode is not None and it is 'file' dummy_file = None if args.dummy_mode == 'file': - if inputs["dummy_nodes"] is None: + if "dummy_nodes" not in inputs: raise ValueError("dummy_nodes file is required when dummy_mode is set to 'file'") bind_path, dummy_file = prepare_volume(inputs["dummy_nodes"], work_dir, container_settings) volumes.append(bind_path) diff --git a/test/BowTieBuilder/test_btb.py b/test/BowTieBuilder/test_btb.py index 1d3fee8e1..18279ddae 100644 --- a/test/BowTieBuilder/test_btb.py +++ b/test/BowTieBuilder/test_btb.py @@ -20,18 +20,18 @@ class TestBowTieBuilder: def test_btb_missing(self): with pytest.raises(ValueError): # No edges - BTB.run({"targets": Path(TEST_DIR, 'input', 'target.txt'), - "sources": Path(TEST_DIR, 'input', 'source.txt')}, + BTB.run({"targets": Path(TEST_DIR, 'input', 'target1.txt'), + "sources": Path(TEST_DIR, 'input', 'source1.txt')}, output_file=OUT_FILE_DEFAULT) with pytest.raises(ValueError): # No source - BTB.run({"targets": Path(TEST_DIR, 'input', 'target.txt'), - "edges": Path(TEST_DIR, 'input', 'edges.txt')}, + BTB.run({"targets": Path(TEST_DIR, 'input', 'target1.txt'), + "edges": Path(TEST_DIR, 'input', 'edges1.txt')}, output_file=OUT_FILE_DEFAULT) with pytest.raises(ValueError): # No target - BTB.run({"sources": Path(TEST_DIR, 'input', 'source.txt'), - "edges": Path(TEST_DIR, 'input', 'edges.txt')}, + BTB.run({"sources": Path(TEST_DIR, 'input', 'source1.txt'), + "edges": Path(TEST_DIR, 'input', 'edges1.txt')}, output_file=OUT_FILE_DEFAULT) From 554e7aeae5852142e3b0f6cdaafc92710e1ad8f6 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Wed, 29 Oct 2025 06:52:13 +0000 Subject: [PATCH 42/68] test: correct misc artifacts --- spras/omicsintegrator1.py | 25 +++++++++++++++---------- test/AllPairs/test_ap.py | 2 -- test/OmicsIntegrator1/test_oi1.py | 8 ++++---- test/analysis/input/egfr.yaml | 20 ++++++++++---------- 4 files changed, 29 insertions(+), 26 deletions(-) diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index be2ea1a18..6994c68ed 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -4,12 +4,13 @@ from pydantic import BaseModel, ConfigDict from spras.config.container_schema import ProcessedContainerSettings +from spras.config.util import CaseInsensitiveEnum from spras.containers import prepare_volume, run_container_and_log from spras.interactome import reinsert_direction_col_mixed from spras.prm import PRM from spras.util import add_rank_column, duplicate_edges, raw_pathway_df -__all__ = ['OmicsIntegrator1', 'OmicsIntegrator1Params', 'write_conf'] +__all__ = ['DummyMode', 'OmicsIntegrator1', 'OmicsIntegrator1Params', 'write_conf'] # TODO decide on default number of processes and threads @@ -39,8 +40,18 @@ def write_conf(filename=Path('config.txt'), w=None, b=None, d=None, mu=None, noi f.write('processes = 1\n') f.write('threads = 1\n') +class DummyMode(CaseInsensitiveEnum): + terminals = 'terminals' + "connect the dummy node to all nodes that have been assigned prizes" + all = 'all' + "connect the dummy node to all nodes in the interactome i.e. full set of nodes in graph" + others = 'others' + "connect the dummy node to all nodes that are not terminal nodes i.e. nodes w/o prizes" + file = 'file' + "connect the dummy node to a specific list of nodes provided in a file" + class OmicsIntegrator1Params(BaseModel): - dummy_mode: Optional[str] = None + dummy_mode: Optional[DummyMode] = None mu_squared: bool = False exclude_terms: bool = False @@ -155,15 +166,9 @@ def run(inputs, output_file, args, container_settings=None): bind_path, prize_file = prepare_volume(inputs["prizes"], work_dir, container_settings) volumes.append(bind_path) - # 4 dummy mode possibilities: - # 1. terminals -> connect the dummy node to all nodes that have been assigned prizes - # 2. all -> connect the dummy node to all nodes in the interactome i.e. full set of nodes in graph - # 3. others -> connect the dummy node to all nodes that are not terminal nodes i.e. nodes w/o prizes - # 4. 
file -> connect the dummy node to a specific list of nodes provided in a file - # add dummy node file to the volume if dummy_mode is not None and it is 'file' dummy_file = None - if args.dummy_mode == 'file': + if args.dummy_mode == DummyMode.file: if "dummy_nodes" not in inputs: raise ValueError("dummy_nodes file is required when dummy_mode is set to 'file'") bind_path, dummy_file = prepare_volume(inputs["dummy_nodes"], work_dir, container_settings) @@ -198,7 +203,7 @@ def run(inputs, output_file, args, container_settings=None): command.extend(['--dummyMode', dummy_file]) # else pass in the dummy_mode and let oi1 handle it else: - command.extend(['--dummyMode', args.dummy_mode]) + command.extend(['--dummyMode', str(args.dummy_mode)]) # Add optional arguments if args.mu_squared: diff --git a/test/AllPairs/test_ap.py b/test/AllPairs/test_ap.py index f5816a3ca..e9e8a8e37 100644 --- a/test/AllPairs/test_ap.py +++ b/test/AllPairs/test_ap.py @@ -78,8 +78,6 @@ def test_allpairs_singularity(self): def test_allpairs_singularity_unpacked(self): out_path = OUT_DIR / 'sample-out-unpack.txt' out_path.unlink(missing_ok=True) - # Indicate via config mechanism that we want to unpack the Singularity container - config.config.unpack_singularity = True AllPairs.run({"nodetypes": str(TEST_DIR / 'input/sample-in-nodetypes.txt'), "network": str(TEST_DIR / 'input/sample-in-net.txt'), "directed_flag": str(TEST_DIR / 'input' / 'directed-flag-false.txt')}, diff --git a/test/OmicsIntegrator1/test_oi1.py b/test/OmicsIntegrator1/test_oi1.py index 49e50148f..bfa14432c 100644 --- a/test/OmicsIntegrator1/test_oi1.py +++ b/test/OmicsIntegrator1/test_oi1.py @@ -5,7 +5,7 @@ import spras.config.config as config from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings -from spras.omicsintegrator1 import OmicsIntegrator1, OmicsIntegrator1Params, write_conf +from spras.omicsintegrator1 import DummyMode, OmicsIntegrator1, OmicsIntegrator1Params, write_conf config.init_from_file("config/config.yaml") @@ -45,7 +45,7 @@ def test_oi1_all_optional(self): "prizes": TEST_DIR+'input/oi1-prizes.txt'}, output_file=OUT_FILE, args=OmicsIntegrator1Params( - dummy_mode='terminals', + dummy_mode=DummyMode.terminals, mu_squared=True, exclude_terms=True, noisy_edges=0, @@ -70,7 +70,7 @@ def test_oi1_dummy_file(self): "dummy_nodes": TEST_DIR + 'input/oi1-dummy.txt'}, output_file=OUT_FILE, args=OmicsIntegrator1Params( - dummy_mode='file', + dummy_mode=DummyMode.file, w=5, b=1, d=10, @@ -106,7 +106,7 @@ def test_oi1_missing_dummy(self): w=5, b=1, d=10, - dummy_mode='file')) + dummy_mode=DummyMode.file)) # Only run Singularity test if the binary is available on the system # spython is only available on Unix, but do not explicitly skip non-Unix platforms diff --git a/test/analysis/input/egfr.yaml b/test/analysis/input/egfr.yaml index d26bded2d..43906c434 100644 --- a/test/analysis/input/egfr.yaml +++ b/test/analysis/input/egfr.yaml @@ -29,15 +29,15 @@ containers: algorithms: - name: pathlinker - params: - include: true + include: true + runs: run1: k: - 10 - 20 - name: omicsintegrator1 - params: - include: true + include: true + runs: run1: b: - 0.55 @@ -55,8 +55,8 @@ algorithms: - 0.008 dummy_mode: ["file"] - name: omicsintegrator2 - params: - include: true + include: true + runs: run1: b: - 4 @@ -68,8 +68,8 @@ algorithms: g: - 3 - name: meo - params: - include: true + include: true + runs: run1: local_search: - "Yes" @@ -78,8 +78,8 @@ algorithms: rand_restarts: - 10 - name: domino - params: - include: true + 
include: true + runs: run1: slice_threshold: - 0.3 From 105b72de77e8cbb57e633a9bea46015895ed31b9 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 29 Oct 2025 07:03:27 +0000 Subject: [PATCH 43/68] fix: allow yaml safe dump of case insensitive enums --- spras/config/util.py | 6 ++++++ test/OmicsIntegrator1/test_oi1.py | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/spras/config/util.py b/spras/config/util.py index 73b4dbeaf..6bde66319 100644 --- a/spras/config/util.py +++ b/spras/config/util.py @@ -6,6 +6,7 @@ from enum import Enum from typing import Any +import yaml from pydantic import BaseModel, ConfigDict @@ -26,6 +27,11 @@ def _missing_(cls, value: Any): return member return None +# We also need to allow `CaseInsensitiveEnum` to be represented in yaml.safe_dump: +yaml.SafeDumper.add_multi_representer( + CaseInsensitiveEnum, + yaml.representer.SafeRepresenter.represent_str, +) class Empty(BaseModel): """ diff --git a/test/OmicsIntegrator1/test_oi1.py b/test/OmicsIntegrator1/test_oi1.py index bfa14432c..c0fa19914 100644 --- a/test/OmicsIntegrator1/test_oi1.py +++ b/test/OmicsIntegrator1/test_oi1.py @@ -5,7 +5,12 @@ import spras.config.config as config from spras.config.container_schema import ContainerFramework, ProcessedContainerSettings -from spras.omicsintegrator1 import DummyMode, OmicsIntegrator1, OmicsIntegrator1Params, write_conf +from spras.omicsintegrator1 import ( + DummyMode, + OmicsIntegrator1, + OmicsIntegrator1Params, + write_conf, +) config.init_from_file("config/config.yaml") From eaecd33ea0eb8b033020e4e130fe09fe8295d7ce Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 29 Oct 2025 07:03:47 +0000 Subject: [PATCH 44/68] docs: give credit to prev. commit --- spras/config/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spras/config/util.py b/spras/config/util.py index 6bde66319..efd9d925d 100644 --- a/spras/config/util.py +++ b/spras/config/util.py @@ -28,6 +28,7 @@ def _missing_(cls, value: Any): return None # We also need to allow `CaseInsensitiveEnum` to be represented in yaml.safe_dump: +# https://github.com/yaml/pyyaml/issues/722#issue-1781352490 yaml.SafeDumper.add_multi_representer( CaseInsensitiveEnum, yaml.representer.SafeRepresenter.represent_str, From 6933fe400305c06bcb8fb519509c2ce919a006a8 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Wed, 29 Oct 2025 07:20:01 +0000 Subject: [PATCH 45/68] fix: correctly write enum --- spras/config/util.py | 2 +- spras/omicsintegrator1.py | 4 ++-- spras/omicsintegrator2.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spras/config/util.py b/spras/config/util.py index efd9d925d..17a1797db 100644 --- a/spras/config/util.py +++ b/spras/config/util.py @@ -6,8 +6,8 @@ from enum import Enum from typing import Any -import yaml +import yaml from pydantic import BaseModel, ConfigDict diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 6994c68ed..8757ab4fb 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -197,13 +197,13 @@ def run(inputs, output_file, args, container_settings=None): '--outlabel', 'oi1'] # add the dummy mode argument - if args.dummy_mode is not None and args.dummy_mode: + if args.dummy_mode is not None: # for custom dummy modes, add the file if dummy_file: command.extend(['--dummyMode', dummy_file]) # else pass in the dummy_mode and let oi1 handle it else: - command.extend(['--dummyMode', str(args.dummy_mode)]) + command.extend(['--dummyMode', args.dummy_mode.value]) # Add optional arguments if args.mu_squared: diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 984a1d4de..a8f5e7c27 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -147,7 +147,7 @@ def run(inputs, output_file, args=None, container_settings=None): command.extend(['--random_terminals', str(args.random_terminals)]) if args.dummy_mode is not None: # This argument does not follow the other naming conventions - command.extend(['--dummyMode', str(args.dummy_mode)]) + command.extend(['--dummyMode', args.dummy_mode.value]) if args.seed is not None: command.extend(['--seed', str(args.seed)]) From 8fa7ca5d949f3eb1efb2a4d68945e928a9810afc Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sat, 1 Nov 2025 09:52:37 +0000 Subject: [PATCH 46/68] test(domino): don't unlink file before reading bad copy & paste --- test/DOMINO/test_domino.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/DOMINO/test_domino.py b/test/DOMINO/test_domino.py index e596d4315..2f88e4c03 100644 --- a/test/DOMINO/test_domino.py +++ b/test/DOMINO/test_domino.py @@ -35,7 +35,6 @@ def test_domino_required(self): DOMINO.run({"network": TEST_DIR / 'input' / 'simple' / 'domino-network.txt', "active_genes": TEST_DIR / 'input' / 'simple' / 'domino-active-genes.txt'}, output_file=OUT_FILE_DEFAULT) - OUT_FILE_DEFAULT.unlink(missing_ok=True) # output_file should be empty assert OUT_FILE_DEFAULT.exists() From f321f7960486044870280f063061f6d7690bc32d Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Sat, 1 Nov 2025 10:07:45 +0000
Subject: [PATCH 47/68] fix: correct other alg parameters, summary test update

---
 spras/analysis/summary.py                     |  1 +
 spras/meo.py                                  |  2 +-
 spras/omicsintegrator1.py                     |  9 +++++--
 spras/omicsintegrator2.py                     |  4 ++--
 .../expected_output/expected_egfr_summary.txt | 18 +++++-------
 .../expected_example_summary.txt              | 24 +++++++++-------
 6 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
index c8abc1cad..d147982e8 100644
--- a/spras/analysis/summary.py
+++ b/spras/analysis/summary.py
@@ -98,6 +98,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
 
         # Algorithm parameters have format { algo : { hashcode : { parameter combos } } }
         param_combo = algo_params[algo][hashcode]
+        # TODO: sort parameters to provide stable summary table output
         cur_nw_info.append(param_combo)
 
         # Save the current network information to the network summary list
diff --git a/spras/meo.py b/spras/meo.py
index 8f97289dc..980d7c220 100644
--- a/spras/meo.py
+++ b/spras/meo.py
@@ -82,7 +82,7 @@ class MEOParams(BaseModel):
     """
 
     rand_restarts: Optional[int] = None
-    "The number of random restarts to do."
+    "The number of random restarts to use."
 
     model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True)
 
diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py
index 8757ab4fb..1a20e1558 100644
--- a/spras/omicsintegrator1.py
+++ b/spras/omicsintegrator1.py
@@ -50,6 +50,11 @@ class DummyMode(CaseInsensitiveEnum):
     file = 'file'
     "connect the dummy node to a specific list of nodes provided in a file"
 
+    # To make sure that DummyMode prints as `terminals`, etc. in parameter dictionaries
+    # (dicts render their values with repr() internally.)
+    def __repr__(self) -> str:
+        return f"'{self.name}'"
+
 class OmicsIntegrator1Params(BaseModel):
     dummy_mode: Optional[DummyMode] = None
     mu_squared: bool = False
@@ -67,7 +72,7 @@ class OmicsIntegrator1Params(BaseModel):
     seed: Optional[int] = None
     "The randomness seed to use."
 
-    w: int
+    w: float
     "Float that affects the number of connected components, with higher values leading to more components"
 
     b: float
@@ -83,7 +88,7 @@ class OmicsIntegrator1Params(BaseModel):
     "Standard Deviation of the gaussian noise added to edges in Noisy Edges Randomizations"
 
     g: float = 0.001
-    "msgsteiner reinforcement parameter that affects the convergence of the solution and runtime, with larger values leading to faster convergence but suboptimal results."
+    "(gamma) msgsteiner reinforcement parameter that affects the convergence of the solution and runtime, with larger values leading to faster convergence but suboptimal results."
 
     r: float = 0
     "msgsteiner parameter that adds random noise to edges, which is rarely needed."
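The `__repr__` override above is what makes enum-valued parameters render as plain strings in the parameter-combination dictionaries written to the summary tables. A minimal sketch of the behavior it relies on, using hypothetical stand-in enums rather than SPRAS's real `CaseInsensitiveEnum` subclass: dict rendering calls repr() on each value, so the default Enum repr leaks into the output unless it is overridden.

from enum import Enum

class Plain(Enum):
    file = 'file'

class Readable(Enum):
    file = 'file'

    # Hypothetical equivalent of the DummyMode override above
    def __repr__(self) -> str:
        return f"'{self.name}'"

print({'dummy_mode': Plain.file})     # {'dummy_mode': <Plain.file: 'file'>}
print({'dummy_mode': Readable.file})  # {'dummy_mode': 'file'}

The second form is what the updated expected summary files below assert, e.g. 'dummy_mode': 'file'.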
diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index a8f5e7c27..5a869eaed 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -20,13 +20,13 @@ class DummyMode(CaseInsensitiveEnum): all = 'all' class OmicsIntegrator2Params(BaseModel): - w: float = 6 + w: float = 5 "Omega: the weight of the edges connecting the dummy node to the nodes selected by dummyMode" b: float = 1 "Beta: scaling factor of prizes" - g: float = 20 + g: float = 3 "Gamma: multiplicative edge penalty from degree of endpoints" noise: Optional[float] = None diff --git a/test/analysis/expected_output/expected_egfr_summary.txt b/test/analysis/expected_output/expected_egfr_summary.txt index 0b4fe9ebd..c087decdb 100644 --- a/test/analysis/expected_output/expected_egfr_summary.txt +++ b/test/analysis/expected_output/expected_egfr_summary.txt @@ -1,10 +1,10 @@ Name Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in sources Nodes in targets Nodes in active Nodes in dummy Parameter combination -test/analysis/input/egfr/tps-egfr-domino-params-V3X4RW7_pathway.txt 48 45 3 0.0398936170212766 5 2.0 16 3.882808476926124 27 0 27 27 0 {'slice_threshold': 0.3, 'module_threshold': 0.05} -test/analysis/input/egfr/tps-egfr-meo-params-GKEDDFZ_pathway.txt 1877 12845 1 0.007295700506524384 469 6.0 6 2.7973618474338107 621 1 620 621 1 {'local_search': 'Yes', 'max_path_length': 3, 'rand_restarts': 10} -test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt 28 20 8 0.05291005291005291 4 1.0 5 1.306439393939394 28 1 27 28 1 {'b': 2, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'} -test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt 39 31 8 0.04183535762483131 6 1.0 5 1.5084498834498834 39 1 38 39 1 {'b': 10, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'} -test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt 14 9 5 0.0989010989010989 4 1.0 2 1.1866666666666668 14 0 14 14 0 {'b': 0.55, 'd': 10, 'g': '1e-3', 'r': 0.01, 'w': 0.1, 'mu': 0.008, 'dummy_mode': 'file'} -test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt 593 591 2 0.0033669841848593955 32 1.0 30 6.72248989073389 531 1 530 531 1 {'b': 2, 'g': 3} -test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt 704 702 2 0.002836867968446916 35 1.0 24 6.038766691954387 616 1 615 616 1 {'b': 4, 'g': 0} -test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt 14 17 1 0.18681318681318682 6 2.0 7 2.857142857142857 6 1 5 6 1 {'k': 10} -test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt 25 32 1 0.10666666666666667 8 2.0 7 3.486666666666667 11 1 10 11 1 {'k': 20} +test/analysis/input/egfr/tps-egfr-domino-params-V3X4RW7_pathway.txt 48 45 3 0.0398936170212766 5 2.0 16 3.882808476926124 27 0 27 27 0 {'module_threshold': 0.05, 'slice_threshold': 0.3, '_spras_run_name': 'run1'} +test/analysis/input/egfr/tps-egfr-meo-params-GKEDDFZ_pathway.txt 1877 12845 1 0.007295700506524384 469 6.0 6 2.7973618474338107 621 1 620 621 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10, '_spras_run_name': 'run1'} +test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt 28 20 8 0.05291005291005291 4 1.0 5 1.306439393939394 28 1 27 28 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 
'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 10.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01, '_spras_run_name': 'run1'} +test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt 39 31 8 0.04183535762483131 6 1.0 5 1.5084498834498834 39 1 38 39 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 2.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01, '_spras_run_name': 'run1'} +test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt 14 9 5 0.0989010989010989 4 1.0 2 1.1866666666666668 14 0 14 14 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 0.55, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01, '_spras_run_name': 'run1'} +test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt 593 591 2 0.0033669841848593955 32 1.0 30 6.72248989073389 531 1 530 531 1 {'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None, '_spras_run_name': 'run1'} +test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt 704 702 2 0.002836867968446916 35 1.0 24 6.038766691954387 616 1 615 616 1 {'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None, '_spras_run_name': 'run2'} +test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt 14 17 1 0.18681318681318682 6 2.0 7 2.857142857142857 6 1 5 6 1 {'k': 10, '_spras_run_name': 'run1'} +test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt 25 32 1 0.10666666666666667 8 2.0 7 3.486666666666667 11 1 10 11 1 {'k': 20, '_spras_run_name': 'run1'} diff --git a/test/analysis/expected_output/expected_example_summary.txt b/test/analysis/expected_output/expected_example_summary.txt index 4cb5b8c8f..110c4abf7 100644 --- a/test/analysis/expected_output/expected_example_summary.txt +++ b/test/analysis/expected_output/expected_example_summary.txt @@ -1,13 +1,13 @@ Name Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in active Nodes in dummy Nodes in sources Nodes in targets Parameter combination -test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'spras_placeholder': 'no parameters'} -test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'slice_threshold': 0.3, 'module_threshold': 0.05} -test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'max_path_length': 3, 'local_search': 'Yes', 'rand_restarts': 10} -test/analysis/input/example/data0-mincostflow-params-SZPZVU6_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'flow': 1, 'capacity': 1} -test/analysis/input/example/data0-omicsintegrator1-params-E3LSEZQ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'b': 6, 'w': 5.0, 'd': 10, 'dummy_mode': 'file'} -test/analysis/input/example/data0-omicsintegrator1-params-NFIPHUX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'b': 6, 'w': 0.0, 'd': 10, 'dummy_mode': 'file'} 
-test/analysis/input/example/data0-omicsintegrator1-params-SU2S63Y_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'b': 5, 'w': 0.0, 'd': 10, 'dummy_mode': 'file'} -test/analysis/input/example/data0-omicsintegrator1-params-V26JBGX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'b': 5, 'w': 5.0, 'd': 10, 'dummy_mode': 'file'} -test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'b': 2, 'g': 3} -test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'b': 4, 'g': 0} -test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 200} -test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 100} +test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'_spras_run_name': 'default'} +test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'module_threshold': 0.05, 'slice_threshold': 0.3, '_spras_run_name': 'run1'} +test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10, '_spras_run_name': 'run1'} +test/analysis/input/example/data0-mincostflow-params-SZPZVU6_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'flow': 1, 'capacity': 1, '_spras_run_name': 'run1'} +test/analysis/input/example/data0-omicsintegrator1-params-E3LSEZQ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0, '_spras_run_name': 'run1'} +test/analysis/input/example/data0-omicsintegrator1-params-NFIPHUX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0, '_spras_run_name': 'run1'} +test/analysis/input/example/data0-omicsintegrator1-params-SU2S63Y_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0, '_spras_run_name': 'run1'} +test/analysis/input/example/data0-omicsintegrator1-params-V26JBGX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0, '_spras_run_name': 'run1'} +test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None, '_spras_run_name': 'run1'} +test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt 3 2 1 0.6666666666666666 2 
1.0 2 1.3333333333333333 2 2 1 1 1 {'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None, '_spras_run_name': 'run2'} +test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 200, '_spras_run_name': 'run1'} +test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 100, '_spras_run_name': 'run1'} From ef02ac5c0cd9a40302b1d319fd597e881ffdf2bf Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sat, 1 Nov 2025 10:09:01 +0000 Subject: [PATCH 48/68] fix(summary): drop _spras_run_name --- spras/analysis/summary.py | 1 + .../expected_output/expected_egfr_summary.txt | 18 +++++++------- .../expected_example_summary.txt | 24 +++++++++---------- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py index d147982e8..2092200f5 100644 --- a/spras/analysis/summary.py +++ b/spras/analysis/summary.py @@ -98,6 +98,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg # Algorithm parameters have format { algo : { hashcode : { parameter combos } } } param_combo = algo_params[algo][hashcode] + del param_combo['_spras_run_name'] # TODO: sort parameters to provide stable summary table output cur_nw_info.append(param_combo) diff --git a/test/analysis/expected_output/expected_egfr_summary.txt b/test/analysis/expected_output/expected_egfr_summary.txt index c087decdb..a2d151a4a 100644 --- a/test/analysis/expected_output/expected_egfr_summary.txt +++ b/test/analysis/expected_output/expected_egfr_summary.txt @@ -1,10 +1,10 @@ Name Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in sources Nodes in targets Nodes in active Nodes in dummy Parameter combination -test/analysis/input/egfr/tps-egfr-domino-params-V3X4RW7_pathway.txt 48 45 3 0.0398936170212766 5 2.0 16 3.882808476926124 27 0 27 27 0 {'module_threshold': 0.05, 'slice_threshold': 0.3, '_spras_run_name': 'run1'} -test/analysis/input/egfr/tps-egfr-meo-params-GKEDDFZ_pathway.txt 1877 12845 1 0.007295700506524384 469 6.0 6 2.7973618474338107 621 1 620 621 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10, '_spras_run_name': 'run1'} -test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt 28 20 8 0.05291005291005291 4 1.0 5 1.306439393939394 28 1 27 28 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 10.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01, '_spras_run_name': 'run1'} -test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt 39 31 8 0.04183535762483131 6 1.0 5 1.5084498834498834 39 1 38 39 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 2.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01, '_spras_run_name': 'run1'} -test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt 14 9 5 0.0989010989010989 4 1.0 2 1.1866666666666668 14 0 14 14 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 0.55, 
'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01, '_spras_run_name': 'run1'} -test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt 593 591 2 0.0033669841848593955 32 1.0 30 6.72248989073389 531 1 530 531 1 {'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None, '_spras_run_name': 'run1'} -test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt 704 702 2 0.002836867968446916 35 1.0 24 6.038766691954387 616 1 615 616 1 {'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None, '_spras_run_name': 'run2'} -test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt 14 17 1 0.18681318681318682 6 2.0 7 2.857142857142857 6 1 5 6 1 {'k': 10, '_spras_run_name': 'run1'} -test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt 25 32 1 0.10666666666666667 8 2.0 7 3.486666666666667 11 1 10 11 1 {'k': 20, '_spras_run_name': 'run1'} +test/analysis/input/egfr/tps-egfr-domino-params-V3X4RW7_pathway.txt 48 45 3 0.0398936170212766 5 2.0 16 3.882808476926124 27 0 27 27 0 {'module_threshold': 0.05, 'slice_threshold': 0.3} +test/analysis/input/egfr/tps-egfr-meo-params-GKEDDFZ_pathway.txt 1877 12845 1 0.007295700506524384 469 6.0 6 2.7973618474338107 621 1 620 621 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10} +test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt 28 20 8 0.05291005291005291 4 1.0 5 1.306439393939394 28 1 27 28 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 10.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01} +test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt 39 31 8 0.04183535762483131 6 1.0 5 1.5084498834498834 39 1 38 39 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 2.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01} +test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt 14 9 5 0.0989010989010989 4 1.0 2 1.1866666666666668 14 0 14 14 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 0.55, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01} +test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt 593 591 2 0.0033669841848593955 32 1.0 30 6.72248989073389 531 1 530 531 1 {'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None} +test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt 704 702 2 0.002836867968446916 35 1.0 24 6.038766691954387 616 1 615 616 1 {'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None} +test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt 14 17 1 0.18681318681318682 6 2.0 7 2.857142857142857 6 1 5 6 1 {'k': 10} +test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt 25 32 1 0.10666666666666667 8 2.0 7 3.486666666666667 11 1 10 11 1 {'k': 20} diff --git a/test/analysis/expected_output/expected_example_summary.txt 
b/test/analysis/expected_output/expected_example_summary.txt index 110c4abf7..78fe74d78 100644 --- a/test/analysis/expected_output/expected_example_summary.txt +++ b/test/analysis/expected_output/expected_example_summary.txt @@ -1,13 +1,13 @@ Name Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in active Nodes in dummy Nodes in sources Nodes in targets Parameter combination -test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'_spras_run_name': 'default'} -test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'module_threshold': 0.05, 'slice_threshold': 0.3, '_spras_run_name': 'run1'} -test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10, '_spras_run_name': 'run1'} -test/analysis/input/example/data0-mincostflow-params-SZPZVU6_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'flow': 1, 'capacity': 1, '_spras_run_name': 'run1'} -test/analysis/input/example/data0-omicsintegrator1-params-E3LSEZQ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0, '_spras_run_name': 'run1'} -test/analysis/input/example/data0-omicsintegrator1-params-NFIPHUX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0, '_spras_run_name': 'run1'} -test/analysis/input/example/data0-omicsintegrator1-params-SU2S63Y_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0, '_spras_run_name': 'run1'} -test/analysis/input/example/data0-omicsintegrator1-params-V26JBGX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0, '_spras_run_name': 'run1'} -test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None, '_spras_run_name': 'run1'} -test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None, '_spras_run_name': 'run2'} -test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 200, '_spras_run_name': 'run1'} -test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt 3 2 1 
0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 100, '_spras_run_name': 'run1'} +test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {} +test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'module_threshold': 0.05, 'slice_threshold': 0.3} +test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10} +test/analysis/input/example/data0-mincostflow-params-SZPZVU6_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'flow': 1, 'capacity': 1} +test/analysis/input/example/data0-omicsintegrator1-params-E3LSEZQ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0} +test/analysis/input/example/data0-omicsintegrator1-params-NFIPHUX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0} +test/analysis/input/example/data0-omicsintegrator1-params-SU2S63Y_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0} +test/analysis/input/example/data0-omicsintegrator1-params-V26JBGX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0} +test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None} +test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None} +test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 200} +test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 100} From c3be5502889fe73155fc1141c600f2beb3392e62 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sat, 8 Nov 2025 13:29:50 -0800 Subject: [PATCH 49/68] docs: update contributing --- docs/contributing/index.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst index 0d8a542ff..46f13869c 100644 --- a/docs/contributing/index.rst +++ b/docs/contributing/index.rst @@ -181,6 +181,8 @@ wrapper functions for the Local Neighborhood algorithm. 
Use Call the new class within ``local_neighborhood.py`` ``LocalNeighborhood`` and set ``__all__`` so the class can be `imported `__. +Make sure to specify the type of parameters passed into ``LocalNeighborhood`` as ``Empty`` +(see ``AllPairs`` for an example of this). Specify the list of ``required_input`` files to be ``network`` and ``nodes``, and set the ``dois`` property to be an empty list. These entries are used to tell Snakemake what input files should be present @@ -242,7 +244,9 @@ the format ``|``, which also differs from the ``omicsintegrator1.py`` example. ``spras/dataset.py`` provides functions that provide access to node information and the interactome (edge list). -Implement the ``run`` function, following the PathLinker example. The +Implement the ``run`` function, following the PathLinker example. Since +there are no arguments, you do not need to instantiate a new instance of ``Empty`` +if parameters are not specified. The ``prepare_volume`` utility function is needed to prepare the network and nodes input files to be mounted and used inside the container. It is also used to prepare the path for the output file, which is different From 830b07c41e0e106294d514cf5cf55cefe288443d Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 10 Nov 2025 18:01:41 +0000 Subject: [PATCH 50/68] chore: drop directed info --- Snakefile | 1 - spras/config/config.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/Snakefile b/Snakefile index c51dc172d..cf075b0fa 100644 --- a/Snakefile +++ b/Snakefile @@ -22,7 +22,6 @@ _config.init_global(config) out_dir = _config.config.out_dir algorithm_params = _config.config.algorithm_params -algorithm_directed = _config.config.algorithm_directed pca_params = _config.config.pca_params hac_params = _config.config.hac_params container_settings = _config.config.container_settings diff --git a/spras/config/config.py b/spras/config/config.py index be206680a..527bec043 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -76,8 +76,6 @@ def __init__(self, raw_config: dict[str, Any]): # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations. # Only includes algorithms that are set to be run with 'include: true'. self.algorithm_params: dict[str, dict[str, Any]] = dict() - # Deprecated. Previously a dict mapping algorithm names to a Boolean tracking whether they used directed graphs. - self.algorithm_directed = None # A dict with the analysis settings self.analysis_params = parsed_raw_config.analysis # A dict with the evaluation settings @@ -149,7 +147,6 @@ def process_algorithms(self, raw_config: RawConfig): """ prior_params_hashes = set() self.algorithm_params = dict() - self.algorithm_directed = dict() self.algorithms = raw_config.algorithms for alg in self.algorithms: if alg.include: From 3b84e8f54ea5e9b686b761e2c724adedcfb01626 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Fri, 21 Nov 2025 10:18:55 -0800 Subject: [PATCH 51/68] docs: typo? Co-authored-by: Neha Talluri <78840540+ntalluri@users.noreply.github.com> --- docs/contributing/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst index 46f13869c..60d8f3eee 100644 --- a/docs/contributing/index.rst +++ b/docs/contributing/index.rst @@ -246,7 +246,7 @@ that provide access to node information and the interactome (edge list). Implement the ``run`` function, following the PathLinker example.
Since there are no arguments, you do not need to instantiate a new instance of ``Empty`` -if parameters are not specified. The +if parameters are not specified. ``prepare_volume`` utility function is needed to prepare the network and nodes input files to be mounted and used inside the container. It is also used to prepare the path for the output file, which is different From 33a85d87fca8a4664c0a2d94e6a9ca4ae51e65f7 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Fri, 21 Nov 2025 18:25:27 +0000 Subject: [PATCH 52/68] chore: apply suggestions, fix meo param --- docker-wrappers/SPRAS/example_config.yaml | 2 +- docs/_static/config/intermediate.yaml | 2 +- docs/prms/meo.rst | 4 ++-- spras/pathlinker.py | 2 +- test/analysis/input/egfr.yaml | 2 +- test/generate-inputs/inputs/test_config.yaml | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docker-wrappers/SPRAS/example_config.yaml b/docker-wrappers/SPRAS/example_config.yaml index db1c2dbbf..f6be1b8e3 100644 --- a/docker-wrappers/SPRAS/example_config.yaml +++ b/docker-wrappers/SPRAS/example_config.yaml @@ -79,7 +79,7 @@ algorithms: include: true run1: max_path_length: [3] - local_search: ["Yes"] + local_search: [true] rand_restarts: [10] - name: "mincostflow" diff --git a/docs/_static/config/intermediate.yaml b/docs/_static/config/intermediate.yaml index 1f0ba2eb5..0e0439e96 100644 --- a/docs/_static/config/intermediate.yaml +++ b/docs/_static/config/intermediate.yaml @@ -52,7 +52,7 @@ algorithms: params: include: true run1: - local_search: ["Yes", "No"] + local_search: [true, false] max_path_length: [2, 3] rand_restarts: 10 - name: allpairs diff --git a/docs/prms/meo.rst b/docs/prms/meo.rst index 55157174a..71206adf9 100644 --- a/docs/prms/meo.rst +++ b/docs/prms/meo.rst @@ -9,6 +9,6 @@ MEO takes in three optional parameters: * max_path_length: The maximal path (from any source to any target) lengths to return when orienting the graph (note: paths may contain duplicate vertices, but never duplicate edges.) -* local_search: a "Yes"/"No" parameter that enables MEO's local search functionality. See "Improving approximations with local search" in - the associated paper for more information. This should almost always be yes. +* local_search: a boolean parameter that enables MEO's local search functionality. See "Improving approximations with local search" in + the associated paper for more information. This should almost always be true. * rand_restarts: the number (int) of random restarts to use. 
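As a point of reference for this change, the boolean form of ``local_search`` slots directly into the pydantic parameter models this patch series introduces. A minimal sketch of what ``MEOParams`` could look like under that pattern (the field names follow ``docs/prms/meo.rst`` above; the defaults and docstrings are illustrative assumptions, not necessarily the values ``spras/meo.py`` ships with):

from pydantic import BaseModel, ConfigDict

class MEOParams(BaseModel):
    # Field names follow docs/prms/meo.rst; these defaults are illustrative
    # assumptions, not necessarily what spras/meo.py actually ships with.
    max_path_length: int = 3
    "Maximal path length (from any source to any target) to return when orienting the graph"
    local_search: bool = True
    "Enables MEO's local search; should almost always be true"
    rand_restarts: int = 10
    "Number of random restarts to use"

    # Mirrors the model_config used by PathLinkerParams in the next diff.
    model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True)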
diff --git a/spras/pathlinker.py b/spras/pathlinker.py index 9378ad284..cf987805b 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -17,7 +17,7 @@ class PathLinkerParams(BaseModel): k: int = 100 - "path length" + "Number of paths" model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) diff --git a/test/analysis/input/egfr.yaml b/test/analysis/input/egfr.yaml index 43906c434..823db03bb 100644 --- a/test/analysis/input/egfr.yaml +++ b/test/analysis/input/egfr.yaml @@ -72,7 +72,7 @@ algorithms: runs: run1: local_search: - - "Yes" + - true max_path_length: - 3 rand_restarts: diff --git a/test/generate-inputs/inputs/test_config.yaml b/test/generate-inputs/inputs/test_config.yaml index 0c83017fe..5638d5cc2 100644 --- a/test/generate-inputs/inputs/test_config.yaml +++ b/test/generate-inputs/inputs/test_config.yaml @@ -36,7 +36,7 @@ algorithms: include: true run1: max_path_length: [3] - local_search: ["Yes"] + local_search: [true] rand_restarts: [10] - name: "mincostflow" From 81a16fe79ce1bfceef53e823bf4ae97fbd8e4c11 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Fri, 21 Nov 2025 18:27:55 +0000 Subject: [PATCH 53/68] docs: move domino cmt --- spras/domino.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spras/domino.py b/spras/domino.py index d574daa30..0f00a85b6 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -81,8 +81,6 @@ def run(inputs, output_file, args=None, container_settings=None): if not args: args = DominoParams() DOMINO.validate_required_run_args(inputs) - # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. - work_dir = '/spras' # Each volume is a tuple (source, destination) @@ -126,6 +124,7 @@ def run(inputs, output_file, args=None, container_settings=None): raise err # Make the Python command to run within the container + # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false. domino_command = ['domino', '--active_genes_files', node_file, '--network_file', network_file, From 25e8b670497349db933482d52438125da2b9d314 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Fri, 21 Nov 2025 18:35:40 +0000 Subject: [PATCH 54/68] docs: more suggestions --- docs/contributing/index.rst | 4 +--- spras/config/algorithms.py | 5 +++-- spras/config/config.py | 2 +- spras/config/util.py | 3 ++- spras/prm.py | 2 ++ 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst index 60d8f3eee..8e21883f0 100644 --- a/docs/contributing/index.rst +++ b/docs/contributing/index.rst @@ -244,9 +244,7 @@ the format ``|``, which also differs from the ``omicsintegrator1.py`` example. ``spras/dataset.py`` provides functions that provide access to node information and the interactome (edge list). -Implement the ``run`` function, following the PathLinker example. Since -there are no arguments, you do not need to instantiate a new instance of ``Empty`` -if parameters are not specified. +Implement the ``run`` function, following the AllPairs example. ``prepare_volume`` utility function is needed to prepare the network and nodes input files to be mounted and used inside the container. 
It is also used to prepare the path for the output file, which is different diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index 2a7620ffd..40f4029bb 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -12,6 +12,7 @@ from spras.runner import algorithms +# This contains the dynamically generated algorithm schema for use in `schema.py` __all__ = ['AlgorithmUnion'] def is_numpy_friendly(type: type[Any] | None) -> bool: @@ -78,11 +79,11 @@ def list_coerce(value: Any) -> Any: return [value] return value +# This is the most 'hacky' part of this code, but, thanks to pydantic, we avoid reflection +# and preserve rich type information at runtime. def construct_algorithm_model(name: str, model: type[BaseModel], model_default: Optional[BaseModel]) -> type[BaseModel]: """ Dynamically constructs a parameter-combination model based on the original args model. - This is the most 'hacky' part of this code, but, thanks to pydantic, we avoid reflection - and preserve rich type information at runtime. """ # First, we need to take our 'model' and coerce it to permit parameter combinations. # This assumes that all of the keys are flattened, so we only get a structure like so: diff --git a/spras/config/config.py b/spras/config/config.py index 527bec043..469df5046 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -191,7 +191,7 @@ def process_algorithms(self, raw_config: RawConfig): f'(current length {self.hash_length}).') # We preserve the run name as it carries useful information for the parameter log, - # and is useful for testing. + # and is useful for configuration testing. run_dict["_spras_run_name"] = run_name self.algorithm_params[alg.name][params_hash] = run_dict diff --git a/spras/config/util.py b/spras/config/util.py index 17a1797db..d59cd57ea 100644 --- a/spras/config/util.py +++ b/spras/config/util.py @@ -27,7 +27,8 @@ def _missing_(cls, value: Any): return member return None -# We also need to allow `CaseInsensitiveEnum` to be represented in yaml.safe_dump: +# We also need to allow `CaseInsensitiveEnum` to be represented in yaml.safe_dump, +# allowing us to safely log parameters in Snakemake: # https://github.com/yaml/pyyaml/issues/722#issue-1781352490 yaml.SafeDumper.add_multi_representer( CaseInsensitiveEnum, diff --git a/spras/prm.py b/spras/prm.py index 6bf930ec2..bf01b3202 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -42,6 +42,8 @@ def generate_inputs(data: Dataset, filename_map: dict[str, str]): """ raise NotImplementedError + # This is used in `runner.py` to avoid a dependency diamond when trying + # to import the actual algorithm schema. @classmethod def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: dict[str, Any], container_settings: ProcessedContainerSettings): """ From dce8e42179a9b801ad3e81de199b03a4ca214e7b Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Fri, 21 Nov 2025 18:40:04 +0000 Subject: [PATCH 55/68] docs: more cmts --- spras/config/algorithms.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index 40f4029bb..627c5ffde 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -28,7 +28,8 @@ def is_numpy_friendly(type: type[Any] | None) -> bool: def python_evalish_coerce(value: Any) -> Any: """ - Allows for using numpy and python calls. + Allows for using numpy and python calls. `range`, `np.linspace`, `np.arange`, and + `np.logspace` are expanded. 
**Safety Note**: This does not prevent availability attacks: this can still exhaust resources if wanted. This only prevents secret leakage. @@ -84,6 +85,10 @@ def list_coerce(value: Any) -> Any: def construct_algorithm_model(name: str, model: type[BaseModel], model_default: Optional[BaseModel]) -> type[BaseModel]: """ Dynamically constructs a parameter-combination model based on the original args model. + + Parameter arguments such as `int` get turned into `list[int]`, and have extra conveniences attached: + - Values can be passed as lists (1 -> [1]) + - Ranges and other convenient calls are expanded (see `python_evalish_coerce`) """ # First, we need to take our 'model' and coerce it to permit parameter combinations. # This assumes that all of the keys are flattened, so we only get a structure like so: From 790cab96093eb6fcd58cb4c3260460a0b45f1a99 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Fri, 21 Nov 2025 19:59:38 +0000 Subject: [PATCH 56/68] docs: add comment to is_numpy_friendly --- spras/config/algorithms.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index 627c5ffde..bc14fd1a7 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -18,7 +18,9 @@ def is_numpy_friendly(type: type[Any] | None) -> bool: """ Whether the passed in type can have any numpy helpers. - This is mainly used to provide hints in the JSON schema. + This is used to provide hints in the JSON schema, + and to determine whether or not to allow for easy ranges using + `python_evalish_coerce`. """ allowed_types = (int, float) From 27e6109877871e9e123a5179aac446bce3c5e3a6 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 21 Nov 2025 21:32:51 +0000 Subject: [PATCH 57/68] docs: fix cmt --- docs/contributing/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst index 8e21883f0..a2c030207 100644 --- a/docs/contributing/index.rst +++ b/docs/contributing/index.rst @@ -244,7 +244,7 @@ the format ``|``, which also differs from the ``omicsintegrator1.py`` example. ``spras/dataset.py`` provides functions that provide access to node information and the interactome (edge list). -Implement the ``run`` function, following the AllPairs example. +Implement the ``run`` function, following the AllPairs example. The ``prepare_volume`` utility function is needed to prepare the network and nodes input files to be mounted and used inside the container. It is also used to prepare the path for the output file, which is different From 893b80d2635dd5de208a7aee4fe70af58b118066 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Fri, 5 Dec 2025 20:25:45 -0800 Subject: [PATCH 58/68] docs: typo Co-authored-by: Anthony Gitter --- spras/config/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index bc14fd1a7..665df572d 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -165,5 +165,5 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: ) algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model, model_default) for name, (_, model, model_default) in algorithms.items()] -# name differentriates algorithms +# name differentiates algorithms AlgorithmUnion = Annotated[Union[tuple(algorithm_models)], Field(discriminator='name')] From 630f5c7a491d700a009dded8ef693783a89c899b Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Fri, 5 Dec 2025 20:26:42 -0800 Subject: [PATCH 59/68] docs: grammar Co-authored-by: Anthony Gitter --- spras/config/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/util.py b/spras/config/util.py index d59cd57ea..f7abf96b2 100644 --- a/spras/config/util.py +++ b/spras/config/util.py @@ -38,6 +38,6 @@ def _missing_(cls, value: Any): class Empty(BaseModel): """ The empty base model. Used for specifying that an algorithm takes no parameters, - yet are deterministic. + yet is deterministic. """ model_config = ConfigDict(extra="forbid") From 3d7cc6d7e724011e2862cc0f89447a1a76d05b85 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sat, 6 Dec 2025 04:32:25 +0000 Subject: [PATCH 60/68] docs: rephrasing, contributing update --- docs/contributing/index.rst | 16 ++++++---------- spras/config/algorithms.py | 6 +++--- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst index a2c030207..17b956d80 100644 --- a/docs/contributing/index.rst +++ b/docs/contributing/index.rst @@ -117,17 +117,13 @@ Step 2: Create a Local Neighborhood Docker image Complete the ``Dockerfile`` in the ``docker-wrappers/LocalNeighborhood`` -directory to create a Docker image. The PathLinker ``Dockerfile`` +directory to create a Docker image. The AllPairs ``Dockerfile`` demonstrates how to begin with a Python image and copy files into the image with ``COPY``. Browse the official `Python images `__ to select a recent version -of Python based on Alpine Linux, a small Linux distribution. Note that -the PathLinker example uses an old version of Python, but this Local +of Python based on Alpine Linux, a small Linux distribution. This Local Neighborhood Docker image should be based on a more modern version of -Python. In addition, not all pathway reconstruction algorithms are -compatible with Alpine Linux, so the default Debian-based Python image -is required. The ``Dockerfile`` does not need an ``ENTRYPOINT`` or -``CMD`` line. It will be used to run a Python command. +Python. Build the Docker image by running @@ -176,7 +172,7 @@ Step 3: Write the Local Neighborhood wrapper functions Add a new Python file ``spras/local_neighborhood.py`` to implement the wrapper functions for the Local Neighborhood algorithm. Use -``pathlinker.py`` as an example. +``allpairs.py`` as an example. Call the new class within ``local_neighborhood.py`` ``LocalNeighborhood`` and set ``__all__`` so the class can be @@ -248,7 +244,7 @@ Implement the ``run`` function, following the AllPairs example. 
The ``prepare_volume`` utility function is needed to prepare the network and nodes input files to be mounted and used inside the container. It is also used to prepare the path for the output file, which is different -from how the output is prepared in the PathLinker example. The +from how the output is prepared in the AllPairs example. The functionality of ``prepare_volume`` is similar to how you had to manually specify paths relative to the container's file system when you interactive tested the container in Step 2. It is not necessary to @@ -326,7 +322,7 @@ Add test functions to the test file ``test/test_ln.py``. This file already has existing tests to test the correctness of the Local Neighborhood implementation that was added to the Docker image. The new tests will test that the ``run`` function of the ``LocalNeighborhood`` -class works correctly. Use ``test_pathlinker.py`` as an example. There +class works correctly. Use ``test_ap.py`` as an example. There are input files for testing in the ``test/LocalNeighborhood/input`` directory. The new test functions will be automatically run as part of diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index bc14fd1a7..4c79b497a 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -30,8 +30,8 @@ def is_numpy_friendly(type: type[Any] | None) -> bool: def python_evalish_coerce(value: Any) -> Any: """ - Allows for using numpy and python calls. `range`, `np.linspace`, `np.arange`, and - `np.logspace` are expanded. + Allows for using numpy and python calls: specifically, + `range`, `np.linspace`, `np.arange`, and `np.logspace` are supported. **Safety Note**: This does not prevent availability attacks: this can still exhaust resources if wanted. This only prevents secret leakage. @@ -69,7 +69,7 @@ def python_evalish_coerce(value: Any) -> Any: arguments = [ast.literal_eval(arg) for arg in value_ast.body.args] if function_name not in functions_dict: - raise ValueError(f"{function_name} is not an allowed function to be run!") + raise ValueError(f"{function_name} is not an allowed function to be run! Allowed functions: {list(functions_dict.keys())}") return functions_dict[function_name](arguments) From fe93bd08f13ade601578aec2a7577e95d61f4724 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sat, 6 Dec 2025 04:37:13 +0000 Subject: [PATCH 61/68] docs: clearer pydantic and reflection calls --- spras/prm.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/spras/prm.py b/spras/prm.py index bf01b3202..1af47cc58 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -49,10 +49,10 @@ def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | o """ This is similar to PRA.run, but it does pydantic logic internally to re-validate argument parameters. """ - # awful reflection here, unfortunately: - # https://stackoverflow.com/a/71720366/7589775 - # alternatively, one could have a T_class parameter - # for PRA here, but this level of implicitness seems alright. + # Gets the parameter type passed into T, allowing us to use the + # underlying pydantic model associated to it. 
(TODO: use get_original_bases when we bump to >= Python 3.12) + # This is hacky reflection from https://stackoverflow.com/a/71720366/7589775 + # which grabs the class of type T T_class: type[T] = get_args(cast(Any, cls).__orig_bases__[0])[0] # Since we just used reflection, we provide a mountain-dewey error message here @@ -60,8 +60,9 @@ def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | o if not issubclass(T_class, BaseModel): raise RuntimeError("The generic passed into PRM is not a pydantic.BaseModel.") - # (and pydantic already provides nice error messages, so we don't need to worry about - # catching this.) + # Validates our untyped `args` parameter against our parameter class of type T + # using BaseModel.model_validate (https://docs.pydantic.dev/latest/api/base_model/#pydantic.BaseModel.model_validate) + # (Pydantic already provides nice error messages, so we don't need to worry about catching this.) T_parsed = T_class.model_validate(args) return cls.run(inputs, output_file, T_parsed, container_settings) From 5911e59b85c2a0db9d66945be630470e2073ed6f Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sat, 6 Dec 2025 05:10:14 +0000 Subject: [PATCH 62/68] fix: pass fewer objects in runner.py --- docker-wrappers/SPRAS/example_config.yaml | 27 ++++++------- docs/_static/config/intermediate.yaml | 35 ++++++++-------- spras/config/algorithms.py | 13 ++++-- spras/config/config.py | 1 + spras/prm.py | 20 ++++++--- spras/runner.py | 49 +++++++++++------------ 6 files changed, 79 insertions(+), 66 deletions(-) diff --git a/docker-wrappers/SPRAS/example_config.yaml b/docker-wrappers/SPRAS/example_config.yaml index f6be1b8e3..1e7fd69c2 100644 --- a/docker-wrappers/SPRAS/example_config.yaml +++ b/docker-wrappers/SPRAS/example_config.yaml @@ -49,14 +49,14 @@ containers: algorithms: - name: "pathlinker" - params: - include: false + include: false + runs: run1: k: range(100,201,100) - name: "omicsintegrator1" - params: - include: true + include: true + runs: run1: r: [5] b: [5, 6] @@ -65,8 +65,8 @@ algorithms: d: [10] - name: "omicsintegrator2" - params: - include: true + include: true + runs: run1: b: [4] g: [0] @@ -75,27 +75,26 @@ algorithms: g: [3] - name: "meo" - params: - include: true + include: true + runs: run1: max_path_length: [3] local_search: [true] rand_restarts: [10] - name: "mincostflow" - params: - include: true + include: true + runs: run1: flow: [1] # The flow must be an int capacity: [1] - name: "allpairs" - params: - include: true + include: true - name: "domino" - params: - include: true + include: true + runs: run1: slice_threshold: [0.3] module_threshold: [0.05] diff --git a/docs/_static/config/intermediate.yaml b/docs/_static/config/intermediate.yaml index 1f0ba2eb5..58d1400d8 100644 --- a/docs/_static/config/intermediate.yaml +++ b/docs/_static/config/intermediate.yaml @@ -23,15 +23,15 @@ containers: algorithms: - name: "pathlinker" - params: - include: true + include: true + runs: run1: k: 1 run2: k: [10, 100] - name: omicsintegrator1 - params: - include: true + include: true + runs: run1: b: [0.55, 2, 10] d: 10 @@ -40,8 +40,8 @@ algorithms: w: 0.1 mu: 0.008 - name: omicsintegrator2 - params: - include: true + include: true + runs: run1: b: 4 g: 0 @@ -49,24 +49,23 @@ algorithms: b: 2 g: 3 - name: meo - params: - include: true + include: true + runs: run1: local_search: [true, false] max_path_length: [2, 3] rand_restarts: 10 - name: allpairs - params: - include: true + include: true - name: domino - params: - include: true + include: true + runs:
run1: slice_threshold: 0.3 module_threshold: 0.05 - name: mincostflow - params: - include: true + include: true + runs: run1: capacity: 15 flow: 80 @@ -77,14 +76,14 @@ algorithms: capacity: 5 flow: 60 - name: "strwr" - params: - include: true + include: true + runs: run1: alpha: [0.85] threshold: [100, 200] - name: "rwr" - params: - include: true + include: true + runs: run1: alpha: [0.85] threshold: [100, 200] diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index 8002957d0..fc23975b9 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -8,7 +8,7 @@ from typing import Annotated, Any, Callable, Literal, Optional, Union, cast, get_args import numpy as np -from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, create_model +from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, ValidationError, create_model from spras.runner import algorithms @@ -84,7 +84,7 @@ def list_coerce(value: Any) -> Any: # This is the most 'hacky' part of this code, but, thanks to pydantic, we avoid reflection # and preserve rich type information at runtime. -def construct_algorithm_model(name: str, model: type[BaseModel], model_default: Optional[BaseModel]) -> type[BaseModel]: +def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseModel]: """ Dynamically constructs a parameter-combination model based on the original args model. @@ -92,6 +92,13 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: - Values can be passed as lists (1 -> [1]) - Ranges and other convenient calls are expanded (see `python_evalish_coerce`) """ + + # Get the default model instance by trying to validate the empty dictionary + try: + model_default = model.model_validate({}) + except ValidationError: + model_default = None + # First, we need to take our 'model' and coerce it to permit parameter combinations. # This assumes that all of the keys are flattened, so we only get a structure like so: # class AlgorithmParams(BaseModel): @@ -164,6 +171,6 @@ def construct_algorithm_model(name: str, model: type[BaseModel], model_default: __config__=ConfigDict(extra='forbid') ) -algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model, model_default) for name, (_, model, model_default) in algorithms.items()] +algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model.get_params_generic()) for name, model in algorithms.items()] # name differentiates algorithms AlgorithmUnion = Annotated[Union[tuple(algorithm_models)], Field(discriminator='name')] diff --git a/spras/config/config.py b/spras/config/config.py index 469df5046..e180183cc 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -164,6 +164,7 @@ def process_algorithms(self, raw_config: RawConfig): # We create the product of all param combinations for each run param_name_list = [] + # We convert our run parameters to a dictionary, allowing us to iterate over it run_subscriptable = vars(runs[run_name]) for param in run_subscriptable: param_name_list.append(param) diff --git a/spras/prm.py b/spras/prm.py index 1af47cc58..3ec81ce46 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -42,6 +42,20 @@ def generate_inputs(data: Dataset, filename_map: dict[str, str]): """ raise NotImplementedError + @classmethod + def get_params_generic(cls) -> type[T]: + """ + Gets the class of the parameter type passed into T, allowing us to use the + underlying pydantic model associated to it.
+ + For example, on `class PathLinker(PRM[PathLinkerParams])`, + calling `PathLinker.get_params_generic()` returns `PathLinkerParams`. + """ + # TODO: use the type-safe get_original_bases when we bump to >= Python 3.12 + # This is hacky reflection from https://stackoverflow.com/a/71720366/7589775 + # which grabs the class of type T by the definition of `__orig_bases__`. + return get_args(cast(Any, cls).__orig_bases__[0])[0] + # This is used in `runner.py` to avoid a dependency diamond when trying # to import the actual algorithm schema. @classmethod @@ -49,11 +63,7 @@ def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | o """ This is similar to PRA.run, but it does pydantic logic internally to re-validate argument parameters. """ - # Gets the parameter type passed into T, allowing us to use the - # underlying pydantic model associated to it. (TODO: use get_original_bases when we bump to >= Python 3.12) - # This is hacky reflection from https://stackoverflow.com/a/71720366/7589775 - # which grabs the class of type T - T_class: type[T] = get_args(cast(Any, cls).__orig_bases__[0])[0] + T_class = cls.get_params_generic() # Since we just used reflection, we provide a mountain-dewey error message here # to protect against any developer confusion. diff --git a/spras/runner.py b/spras/runner.py index 05cb0f155..73d059b3c 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -1,42 +1,39 @@ -from typing import Any, Optional - -from pydantic import BaseModel +from typing import Any # supported algorithm imports from spras.allpairs import AllPairs from spras.btb import BowTieBuilder -from spras.config.util import Empty from spras.dataset import Dataset -from spras.domino import DOMINO, DominoParams -from spras.meo import MEO, MEOParams -from spras.mincostflow import MinCostFlow, MinCostFlowParams -from spras.omicsintegrator1 import OmicsIntegrator1, OmicsIntegrator1Params -from spras.omicsintegrator2 import OmicsIntegrator2, OmicsIntegrator2Params -from spras.pathlinker import PathLinker, PathLinkerParams +from spras.domino import DOMINO +from spras.meo import MEO +from spras.mincostflow import MinCostFlow +from spras.omicsintegrator1 import OmicsIntegrator1 +from spras.omicsintegrator2 import OmicsIntegrator2 +from spras.pathlinker import PathLinker from spras.prm import PRM -from spras.responsenet import ResponseNet, ResponseNetParams -from spras.rwr import RWR, RWRParams -from spras.strwr import ST_RWR, ST_RWRParams +from spras.responsenet import ResponseNet +from spras.rwr import RWR +from spras.strwr import ST_RWR # Algorithm names to a three-tuple of (PRM, BaseModel, default BaseModel or None if there are no good defaults). 
# This is used for the configuration and to fetch algorithms during reconstruction -algorithms: dict[str, tuple[type[PRM], type[BaseModel], Optional[BaseModel]]] = { - "allpairs": (AllPairs, Empty, Empty()), - "bowtiebuilder": (BowTieBuilder, Empty, Empty()), - "domino": (DOMINO, DominoParams, DominoParams()), - "meo": (MEO, MEOParams, MEOParams()), - "mincostflow": (MinCostFlow, MinCostFlowParams, MinCostFlowParams()), - "omicsintegrator1": (OmicsIntegrator1, OmicsIntegrator1Params, None), - "omicsintegrator2": (OmicsIntegrator2, OmicsIntegrator2Params, OmicsIntegrator2Params()), - "pathlinker": (PathLinker, PathLinkerParams, PathLinkerParams()), - "responsenet": (ResponseNet, ResponseNetParams, ResponseNetParams()), - "rwr": (RWR, RWRParams, None), - "strwr": (ST_RWR, ST_RWRParams, None), +algorithms: dict[str, type[PRM]] = { + "allpairs": AllPairs, + "bowtiebuilder": BowTieBuilder, + "domino": DOMINO, + "meo": MEO, + "mincostflow": MinCostFlow, + "omicsintegrator1": OmicsIntegrator1, + "omicsintegrator2": OmicsIntegrator2, + "pathlinker": PathLinker, + "responsenet": ResponseNet, + "rwr": RWR, + "strwr": ST_RWR, } def get_algorithm(algorithm: str) -> type[PRM]: try: - return algorithms[algorithm.lower()][0] + return algorithms[algorithm.lower()] except KeyError as exc: raise NotImplementedError(f'{algorithm} is not currently supported.') from exc From ee4157588f5be40b97e42c4ce8c4d7779cdda424 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sat, 6 Dec 2025 05:25:29 +0000 Subject: [PATCH 63/68] docs: better algorithm fail message --- spras/config/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index fc23975b9..4e1186c5f 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -60,7 +60,7 @@ def python_evalish_coerce(value: Any) -> Any: # This should always be an Expression whose body is Call (a function). if not isinstance(value_ast.body, ast.Call): - raise ValueError(f'The python code "{value}" should be calling a function directly. Is this meant to be python code?') + raise ValueError(f'This argument "{value}" was interpreted as a non-function-calling string: it should be a function call (e.g. range(100, 201, 50)), or an int or a float.') # We get the function name back as a string function_name = ast.unparse(value_ast.body.func) From a5832eed5221d5a905109eca4809f6a6c3d8be95 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Sat, 6 Dec 2025 05:48:12 +0000 Subject: [PATCH 64/68] docs: more PRA#run documentation --- spras/config/algorithms.py | 11 +++++++++-- spras/prm.py | 3 +++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index 4e1186c5f..552fbc4e0 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -5,10 +5,17 @@ """ import ast import copy -from typing import Annotated, Any, Callable, Literal, Optional, Union, cast, get_args +from typing import Annotated, Any, Callable, Literal, Union, cast, get_args import numpy as np -from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, ValidationError, create_model +from pydantic import ( + BaseModel, + BeforeValidator, + ConfigDict, + Field, + ValidationError, + create_model, +) from spras.runner import algorithms diff --git a/spras/prm.py b/spras/prm.py index 3ec81ce46..636859cf2 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -83,6 +83,9 @@ def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, ar """ Runs an algorithm with the specified inputs, algorithm params (T), the designated output_file, and the desired container_settings. + + See the algorithm-specific `generate_inputs` and `parse_output` + for information about the input and output format. """ raise NotImplementedError From 9fd5b0e76dfb0d1e3d7d82a0728732ed6c1fdcfb Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sat, 6 Dec 2025 06:00:17 +0000 Subject: [PATCH 65/68] docs: add generate inputs info --- spras/allpairs.py | 5 ++++- spras/btb.py | 8 ++++++++ spras/domino.py | 7 +++++++ spras/meo.py | 6 ++++-- spras/mincostflow.py | 5 ++++- spras/omicsintegrator1.py | 6 ++++-- spras/omicsintegrator2.py | 7 ++++--- spras/pathlinker.py | 5 +++-- spras/responsenet.py | 5 ++++- spras/rwr.py | 7 +++++++ spras/strwr.py | 8 ++++++++ 11 files changed, 57 insertions(+), 12 deletions(-) diff --git a/spras/allpairs.py b/spras/allpairs.py index 25c838cef..21fca6ee4 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -25,7 +25,10 @@ def generate_inputs(data: Dataset, filename_map): """ Access fields from the dataset and write the required input files @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with: + - nodetypes: node types with sources and targets + - network: network file containing edges and their weights + - directed_flag: contains `true` if `network` is fully directed. """ AllPairs.validate_required_inputs(filename_map) diff --git a/spras/btb.py b/spras/btb.py index 439792bff..d2f18debe 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -32,6 +32,14 @@ class BowTieBuilder(PRM[Empty]): #generate input taken from meo.py because they have same input requirements @staticmethod def generate_inputs(data, filename_map): + """ + Access fields from the dataset and write the required input files + @param data: dataset + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. 
Associated files will be written with: + - sources: NODEID-headered list of sources + - targets: NODEID-headered list of targets + - edges: node pairs with associated edge weights + """ BowTieBuilder.validate_required_inputs(filename_map) # Get sources and write to file, repeat for targets diff --git a/spras/domino.py b/spras/domino.py index 0f00a85b6..316044432 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -44,6 +44,13 @@ class DOMINO(PRM[DominoParams]): @staticmethod def generate_inputs(data, filename_map): + """ + Access fields from the dataset and write the required input files + @param data: dataset + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with: + - network: list of edges + - active_genes: list of active genes + """ DOMINO.validate_required_inputs(filename_map) # Get active genes for node input file diff --git a/spras/meo.py b/spras/meo.py index 6be270bf5..5d4630f43 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -111,8 +111,10 @@ def generate_inputs(data, filename_map): """ Access fields from the dataset and write the required input files @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type - @return: + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with: + - sources: list of sources + - targets: list of targets + - edges: list of edges """ MEO.validate_required_inputs(filename_map) diff --git a/spras/mincostflow.py b/spras/mincostflow.py index fa65559d1..dad1d706c 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -47,7 +47,10 @@ def generate_inputs(data, filename_map): """ Access fields from the dataset and write the required input files @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with: + - sources: list of sources + - targets: list of targets + - edges: list of edges """ MinCostFlow.validate_required_inputs(filename_map) diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 1a20e1558..3e5cbf1b7 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -116,8 +116,10 @@ def generate_inputs(data, filename_map): """ Access fields from the dataset and write the required input files @param data: dataset - @param filename_map: a dict mapping file types in the required_inputs to the filename for that type - @return: + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with: + - prizes: list of nodes associated with their prize + - edges: list of edges associated with their weight and directionality + - dummy_nodes: list of dummy nodes """ OmicsIntegrator1.validate_required_inputs(filename_map) diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 5a869eaed..25a6cdc2b 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -69,10 +69,11 @@ class OmicsIntegrator2(PRM[OmicsIntegrator2Params]): @staticmethod def generate_inputs(data: Dataset, filename_map): """ - Access fields from the dataset and write the required input files. - Automatically converts edge weights to edge costs. 
diff --git a/spras/pathlinker.py b/spras/pathlinker.py
index cf987805b..c534f2944 100644
--- a/spras/pathlinker.py
+++ b/spras/pathlinker.py
@@ -40,8 +40,9 @@ def generate_inputs(data, filename_map):
         """
         Access fields from the dataset and write the required input files
         @param data: dataset
-        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type
-        @return:
+        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with:
+          - nodetypes: list of nodes tagged with whether they are a source or a target
+          - network: list of edges
         """
         PathLinker.validate_required_inputs(filename_map)
 
diff --git a/spras/responsenet.py b/spras/responsenet.py
index 92e85245b..4989de482 100644
--- a/spras/responsenet.py
+++ b/spras/responsenet.py
@@ -41,7 +41,10 @@ def generate_inputs(data, filename_map):
         """
         Access fields from the dataset and write the required input files
         @param data: dataset
-        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type
+        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with:
+          - sources: list of sources
+          - targets: list of targets
+          - edges: list of edges
         """
         ResponseNet.validate_required_inputs(filename_map)
 
diff --git a/spras/rwr.py b/spras/rwr.py
index cfa30b73d..e6f54d674 100644
--- a/spras/rwr.py
+++ b/spras/rwr.py
@@ -31,6 +31,13 @@ class RWR(PRM[RWRParams]):
 
     @staticmethod
     def generate_inputs(data, filename_map):
+        """
+        Access fields from the dataset and write the required input files
+        @param data: dataset
+        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with:
+          - nodes: list of active nodes
+          - network: list of edges
+        """
         RWR.validate_required_inputs(filename_map)
 
         # Get sources and targets for node input file
diff --git a/spras/strwr.py b/spras/strwr.py
index 6cd0715dd..42928e4cd 100644
--- a/spras/strwr.py
+++ b/spras/strwr.py
@@ -31,6 +31,14 @@ class ST_RWR(PRM[ST_RWRParams]):
 
     @staticmethod
    def generate_inputs(data, filename_map):
+        """
+        Access fields from the dataset and write the required input files
+        @param data: dataset
+        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with:
+          - sources: list of sources
+          - targets: list of targets
+          - network: list of edges
+        """
         ST_RWR.validate_required_inputs(filename_map)
 
         # Get separate source and target nodes for source and target files
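With patch 64's run docstring and patch 65's generate_inputs docstrings in place, the two halves of a wrapper read as one flow. A hedged usage sketch: keyword arguments are used deliberately because run's full positional signature is truncated in patch 64's hunk header, `dataset` stands for an already-loaded spras.dataset.Dataset, the file paths are invented, and ST_RWRParams() assumes usable defaults exist:

    filename_map = {
        'sources': 'input/sources.txt',
        'targets': 'input/targets.txt',
        'network': 'input/network.txt',
    }
    ST_RWR.generate_inputs(dataset, filename_map)  # writes the files listed above
    ST_RWR.run(inputs=filename_map,                # same keys as required_inputs
               args=ST_RWRParams(),                # validated pydantic params (T)
               output_file='output/pathway.txt')

In practice this orchestration happens through spras.runner and the Snakemake workflow rather than direct calls.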
From 75c20352d5d9d0ce490f2ee22ed24a9cd5a0e507 Mon Sep 17 00:00:00 2001
From: "Tristan F."
Date: Sat, 6 Dec 2025 06:08:17 +0000
Subject: [PATCH 66/68] docs: drop unused cmt

---
 spras/runner.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/spras/runner.py b/spras/runner.py
index 73d059b3c..d138d8e33 100644
--- a/spras/runner.py
+++ b/spras/runner.py
@@ -15,8 +15,6 @@ from spras.rwr import RWR
 from spras.strwr import ST_RWR
 
-# Algorithm names to a three-tuple of (PRM, BaseModel, default BaseModel or None if there are no good defaults).
-# This is used for the configuration and to fetch algorithms during reconstruction
 algorithms: dict[str, type[PRM]] = {
     "allpairs": AllPairs,
     "bowtiebuilder": BowTieBuilder,

From 70175c8c6f609a791ef95fe0132f12d6e0b3f312 Mon Sep 17 00:00:00 2001
From: "Tristan F."
Date: Sun, 21 Dec 2025 01:09:13 +0000
Subject: [PATCH 67/68] docs(contributing): defer to pathlinker for dockerfile

---
 docs/contributing/index.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst
index 17b956d80..b0780619f 100644
--- a/docs/contributing/index.rst
+++ b/docs/contributing/index.rst
@@ -117,7 +117,7 @@ Step 2: Create a Local Neighborhood Docker image
 
 Complete the ``Dockerfile`` in the ``docker-wrappers/LocalNeighborhood``
-directory to create a Docker image. The AllPairs ``Dockerfile``
+directory to create a Docker image. The PathLinker ``Dockerfile``
 demonstrates how to begin with a Python image and copy files into the
 image with ``COPY``. Browse the official `Python images
 <https://hub.docker.com/_/python>`__ to select a recent version

From 569a49b7c8fa5fac8c85ba68456fd9f7dc7cd898 Mon Sep 17 00:00:00 2001
From: "Tristan F."
Date: Sun, 21 Dec 2025 01:09:30 +0000
Subject: [PATCH 68/68] docs(oi2): mention weight to cost transformation

---
 spras/omicsintegrator2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py
index 25a6cdc2b..38624d3ab 100644
--- a/spras/omicsintegrator2.py
+++ b/spras/omicsintegrator2.py
@@ -73,7 +73,7 @@ def generate_inputs(data: Dataset, filename_map):
         @param data: dataset
         @param filename_map: a dict mapping file types in the required_inputs to the filename for that type. Associated files will be written with:
           - prizes: list of nodes associated with their prize
-          - edges: list of edges associated with their cost
+          - edges: list of edges associated with their cost (transformed from the original Dataset weights)
         """
 
         OmicsIntegrator2.validate_required_inputs(filename_map)
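Patch 68's parenthetical matters because Omics Integrator 2 minimizes edge costs, while SPRAS datasets carry edge weights in which a larger weight means a more confident interaction. The diff documents the inversion without showing it; one common way to realize such a transform is sketched below, and this exact formula is an assumption, not necessarily the one generate_inputs uses:

    import pandas as pd

    def weights_to_costs(edges: pd.DataFrame) -> pd.DataFrame:
        # Invert so that high-confidence (high-weight) edges become cheap to keep.
        w = edges['Weight']
        return edges.assign(cost=w.max() + w.min() - w).drop(columns=['Weight'])

    toy = pd.DataFrame({'Interactor1': ['A', 'B'], 'Interactor2': ['B', 'C'],
                        'Weight': [0.9, 0.2]})
    print(weights_to_costs(toy))  # costs come out 0.2 and 0.9: the ordering flips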