diff --git a/.github/workflows/build-containers.yml b/.github/workflows/build-containers.yml
index 8ebc59435..f371e30fd 100644
--- a/.github/workflows/build-containers.yml
+++ b/.github/workflows/build-containers.yml
@@ -48,6 +48,11 @@ jobs:
     with:
       path: docker-wrappers/Cytoscape
       container: reedcompbio/py4cytoscape
+  build-and-remove-capdsd:
+    uses: "./.github/workflows/build-and-remove-template.yml"
+    with:
+      path: docker-wrappers/capDSD
+      container: reedcompbio/capdsd
   build-and-remove-spras:
     uses: "./.github/workflows/build-and-remove-template.yml"
     with:
diff --git a/Snakefile b/Snakefile
index df90f8e4a..d9ff74e56 100644
--- a/Snakefile
+++ b/Snakefile
@@ -6,6 +6,7 @@ from spras.dataset import Dataset
 from spras.evaluation import Evaluation
 from spras.analysis import ml, summary, graphspace, cytoscape
 import spras.config as _config
+from spras.util import extend_filename
 
 # Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037
 # and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them
@@ -189,7 +190,9 @@ checkpoint prepare_input:
         # Use the algorithm's generate_inputs function to load the merged dataset, extract the relevant columns,
         # and write the output files specified by required_inputs
         # The filename_map provides the output file path for each required input file type
-        filename_map = {input_type: SEP.join([out_dir, 'prepared', f'{wildcards.dataset}-{wildcards.algorithm}-inputs', f'{input_type}.txt']) for input_type in runner.get_required_inputs(wildcards.algorithm)}
+        filename_map = {input_type: SEP.join(
+            [out_dir, 'prepared', f'{wildcards.dataset}-{wildcards.algorithm}-inputs', extend_filename(input_type)]
+        ) for input_type in runner.get_required_inputs(wildcards.algorithm)}
         runner.prepare_inputs(wildcards.algorithm, input.dataset_file, filename_map)
 
 # Collect the prepared input files from the specified directory
@@ -207,7 +210,7 @@ def collect_prepared_input(wildcards):
     prepared_dir = SEP.join([out_dir, 'prepared', f'{wildcards.dataset}-{wildcards.algorithm}-inputs'])
 
     # Construct the list of expected prepared input files for the reconstruction algorithm
-    prepared_inputs = expand(f'{prepared_dir}{SEP}{{type}}.txt',type=runner.get_required_inputs(algorithm=wildcards.algorithm))
+    prepared_inputs = expand(f'{prepared_dir}{SEP}{{type}}',type=map(extend_filename, runner.get_required_inputs(algorithm=wildcards.algorithm)))
     # If the directory is missing, do nothing because the missing output triggers running prepare_input
     if os.path.isdir(prepared_dir):
         # If the directory exists, confirm all prepared input files exist as well (as opposed to some or none)
@@ -238,7 +241,10 @@ rule reconstruct:
         # Create a copy so that the updates are not written to the parameters logfile
         params = reconstruction_params(wildcards.algorithm, wildcards.params).copy()
         # Add the input files
-        params.update(dict(zip(runner.get_required_inputs(wildcards.algorithm), *{input}, strict=True)))
+        params.update(dict(zip(
+            [inp.replace(".", "_") for inp in runner.get_required_inputs(wildcards.algorithm)],
+            *{input}, strict=True
+        )))
         # Add the output file
         # All run functions can accept a relative path to the output file that should be written that is called 'output_file'
         params['output_file'] = output.pathway_file
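
To make the two Snakefile changes concrete, here is a minimal sketch (not part of the diff) of the mapping they produce for capDSD, using the literal input names from CapDSD.required_inputs below:

    from spras.util import extend_filename

    required = ['ppi', 'ppip.ppip']  # runner.get_required_inputs('capdsd')

    # prepare_input: default to .txt, but keep an existing extension
    filenames = [extend_filename(t) for t in required]    # ['ppi.txt', 'ppip.ppip']

    # reconstruct: turn the same names into keyword-safe parameter keys
    param_keys = [t.replace(".", "_") for t in required]  # ['ppi', 'ppip_ppip']
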
diff --git a/config/config.yaml b/config/config.yaml
index 4f16beded..7e52876f6 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -24,7 +24,7 @@ container_registry:
   base_url: docker.io
   # The owner or project of the registry
   # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs
-  owner: reedcompbio
+  owner: pubtristanf
 
 # This list of algorithms should be generated by a script which checks the filesystem for installs.
 # It shouldn't be changed by mere mortals. (alternatively, we could add a path to executable for each algorithm
@@ -96,6 +96,10 @@ algorithms:
       slice_threshold: [0.3]
       module_threshold: [0.05]
 
+  - name: "capdsd"
+    params:
+      include: true
+
 # Here we specify which pathways to run and other file location information.
 # DataLoader.py can currently only load a single dataset
 # Assume that if a dataset label does not change, the lists of associated input files do not change
diff --git a/docker-wrappers/capDSD/Dockerfile b/docker-wrappers/capDSD/Dockerfile
new file mode 100644
index 000000000..4fb7b38ae
--- /dev/null
+++ b/docker-wrappers/capDSD/Dockerfile
@@ -0,0 +1,8 @@
+FROM python:2.7.18
+
+RUN pip install numpy==1.16.6
+# Since this is an arbitrary internet ZIP file, we use the web archive link instead.
+# TODO: checksum?
+RUN wget https://web.archive.org/web/20250616194746/http://dsd.cs.tufts.edu/capdsd/files//capDSD-src.zip
+
+RUN unzip capDSD-src.zip -d capDSD/
diff --git a/docker-wrappers/capDSD/README.md b/docker-wrappers/capDSD/README.md
new file mode 100644
index 000000000..caa8b7400
--- /dev/null
+++ b/docker-wrappers/capDSD/README.md
@@ -0,0 +1,3 @@
+# capDSD Docker Image
+
+A Docker image for [capDSD](https://doi.org/10.1093/bioinformatics/btu263) that is available on [DockerHub](https://hub.docker.com/repository/docker/reedcompbio/capdsd).
\ No newline at end of file
diff --git a/spras/capDSD.py b/spras/capDSD.py
new file mode 100644
index 000000000..f2d41c577
--- /dev/null
+++ b/spras/capDSD.py
@@ -0,0 +1,89 @@
+from pathlib import Path
+
+from spras.containers import prepare_volume, run_container_and_log
+from spras.dataset import Dataset
+from spras.interactome import convert_directed_to_undirected
+from spras.prm import PRM
+
+__all__ = ['CapDSD']
+
+class CapDSD(PRM):
+    required_inputs = ['ppi', 'ppip.ppip']
+
+    @staticmethod
+    def generate_inputs(data: Dataset, filename_map: dict[str, str]):
+        """
+        Access fields from the dataset and write the required input files
+        @param data: dataset
+        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type
+        """
+        for input_type in CapDSD.required_inputs:
+            if input_type not in filename_map:
+                raise ValueError(f"{input_type} filename is missing")
+
+        # Create the undirected PPI network file
+        ppi = data.get_interactome()
+        ppi = convert_directed_to_undirected(ppi)
+        ppi.to_csv(filename_map['ppi'], sep='\t', index=False, columns=["Interactor1", "Interactor2", "Weight"],
+                   header=False)
+
+        # Then, we want to 'guide' the ppi with a .ppip file, which is a secondary,
+        # trusted interactome: we use the directed edges from the interactome as our
+        # trusted edges.
+        ppip = data.get_interactome()
+        ppip = ppip[ppip["Direction"] == "D"]
+        ppip.to_csv(filename_map['ppip.ppip'], sep='\t', index=False, columns=["Interactor1", "Interactor2"], header=False)
+
+    @staticmethod
+    def run(ppi=None, ppip=None, output_file=None, container_framework="docker"):
+        """
+        Run capDSD with Docker
+        @param ppi: input interactome file containing only undirected edges (required)
+        @param ppip: input interactome file containing only directed edges (required)
+        @param output_file: path to the output matrix (required)
+        @param container_framework: specify a container framework
+        """
+        if not ppi or not ppip or not output_file:
+            raise ValueError("Required capDSD arguments are missing")
+
+        work_dir = '/capDSD'
+
+        volumes = list()
+
+        bind_path, ppi_file = prepare_volume(ppi, work_dir)
+        volumes.append(bind_path)
+
+        bind_path, ppip_file = prepare_volume(ppip, work_dir)
+        volumes.append(bind_path)
+
+        # Create a prefix for the output filename and ensure the directory exists
+        out_dir = Path(output_file).parent
+        out_dir.mkdir(parents=True, exist_ok=True)
+        bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir)
+        volumes.append(bind_path)
+        mapped_out_prefix = mapped_out_dir + '/output'
+
+        container_suffix = "capdsd"
+
+        # Since the volumes are bound under different folders, we can safely
+        # use the parent directory of ppip_file.
+        command = ['python',
+                   '/capDSD/DSD.py',
+                   '-pathmode', '1',
+                   '-p', str(Path(ppip_file).parent),
+                   ppi_file, mapped_out_prefix]
+
+        run_container_and_log('capDSD',
+                              container_framework,
+                              container_suffix,
+                              command,
+                              volumes,
+                              work_dir)
+
+        output_matrix = Path(out_dir) / 'output.dsd'
+        output_matrix.rename(output_file)
+
+    @staticmethod
+    def parse_output(raw_pathway_file: str, standardized_pathway_file: str):
+        pass
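
For reference, both files written by generate_inputs are headerless and tab-separated (to_csv with sep='\t'): the ppi file is a weighted, undirected edge list, while the .ppip file keeps only the directed edges, without the weight column, as the trusted interactome. The hand-written test inputs added below illustrate the two formats:

    # ppi file (undirected, weighted):
    A    B    0.5
    C    D    0.75

    # ppip file (directed, trusted edges only):
    B    C
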
diff --git a/spras/runner.py b/spras/runner.py
index 8490644c1..8eb42d49f 100644
--- a/spras/runner.py
+++ b/spras/runner.py
@@ -1,5 +1,6 @@
 # supported algorithm imports
 from spras.allpairs import AllPairs as allpairs
+from spras.capDSD import CapDSD as capdsd
 from spras.dataset import Dataset
 from spras.domino import DOMINO as domino
 from spras.meo import MEO as meo
diff --git a/spras/util.py b/spras/util.py
index 83cca945d..8e9b6865f 100644
--- a/spras/util.py
+++ b/spras/util.py
@@ -105,3 +105,13 @@ def duplicate_edges(df: pd.DataFrame) -> (pd.DataFrame, bool):
     unique_edges_df = df_sorted.drop_duplicates(subset=["Node1", "Node2", "Direction"], keep="first", ignore_index=True)
 
     return unique_edges_df, not unique_edges_df.equals(df)
+
+# https://stackoverflow.com/a/49689414/7589775
+def extend_filename(file_name: str, extension=".txt") -> str:
+    """
+    Adds a default file extension if none is provided.
+    """
+    root, ext = os.path.splitext(file_name)
+    if not ext:
+        ext = extension
+    return f'{root}{ext}'
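
A quick illustrative check of the new helper's behavior (note that it relies on os.path.splitext, so this assumes spras/util.py already imports os):

    from spras.util import extend_filename

    assert extend_filename('ppi') == 'ppi.txt'            # no extension: default appended
    assert extend_filename('ppip.ppip') == 'ppip.ppip'    # existing extension kept
    assert extend_filename('edges', '.tsv') == 'edges.tsv'
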
diff --git a/test/capDSD/expected/capdsd-matrix-expected.txt b/test/capDSD/expected/capdsd-matrix-expected.txt
new file mode 100644
index 000000000..98ec7ffc5
--- /dev/null
+++ b/test/capDSD/expected/capdsd-matrix-expected.txt
@@ -0,0 +1,5 @@
+ A B C D
+A 0.0 1.9999962366471538 4.153838337651781 4.153838337651781
+B 1.9999962366471538 0.0 2.153842101004627 2.153842101004627
+C 4.153838337651781 2.153842101004627 0.0 0.0
+D 4.153838337651781 2.153842101004627 0.0 0.0
diff --git a/test/capDSD/input/capdsd-ppi.txt b/test/capDSD/input/capdsd-ppi.txt
new file mode 100644
index 000000000..4e05c5b63
--- /dev/null
+++ b/test/capDSD/input/capdsd-ppi.txt
@@ -0,0 +1,2 @@
+A B 0.5
+C D 0.75
\ No newline at end of file
diff --git a/test/capDSD/input/capdsd-ppip.ppip b/test/capDSD/input/capdsd-ppip.ppip
new file mode 100644
index 000000000..5aa7ba857
--- /dev/null
+++ b/test/capDSD/input/capdsd-ppip.ppip
@@ -0,0 +1 @@
+B C
\ No newline at end of file
diff --git a/test/capDSD/test_capDSD.py b/test/capDSD/test_capDSD.py
new file mode 100644
index 000000000..bed97c196
--- /dev/null
+++ b/test/capDSD/test_capDSD.py
@@ -0,0 +1,61 @@
+import filecmp
+import shutil
+from pathlib import Path
+
+import pytest
+
+import spras.config as config
+from spras.capDSD import CapDSD
+
+config.init_from_file("config/config.yaml")
+
+TEST_DIR = Path('test', 'capDSD')
+IN_DIR = TEST_DIR / 'input'
+OUT_DIR = TEST_DIR / 'output'
+EXPECTED_DIR = TEST_DIR / 'expected'
+
+INPUT_PPI = IN_DIR / 'capdsd-ppi.txt'
+INPUT_PPIP = IN_DIR / 'capdsd-ppip.ppip'
+
+OUT_FILE = OUT_DIR / 'output.txt'
+EXPECTED_FILE = EXPECTED_DIR / 'capdsd-matrix-expected.txt'
+
+
+class TestCapDSD:
+    """
+    Run capDSD tests in the Docker image
+    """
+    def test_capdsd_required(self):
+        OUT_FILE.unlink(missing_ok=True)
+        # Only include required arguments
+        CapDSD.run(
+            ppi=INPUT_PPI,
+            ppip=INPUT_PPIP,
+            output_file=OUT_FILE
+        )
+        assert OUT_FILE.exists()
+
+        assert filecmp.cmp(OUT_FILE, EXPECTED_FILE)
+
+    def test_capdsd_missing(self):
+        # Test the expected error is raised when required arguments are missing
+        with pytest.raises(ValueError):
+            # No PPI
+            CapDSD.run(
+                ppip=INPUT_PPIP,
+                output_file=OUT_FILE
+            )
+
+    # Only run Singularity test if the binary is available on the system
+    # spython is only available on Unix, but do not explicitly skip non-Unix platforms
+    @pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system')
+    def test_capdsd_singularity(self):
+        OUT_FILE.unlink(missing_ok=True)
+        # Only include required arguments and run with Singularity
+        CapDSD.run(
+            ppi=INPUT_PPI,
+            ppip=INPUT_PPIP,
+            output_file=OUT_FILE,
+            container_framework="singularity")
+        assert OUT_FILE.exists()
+
+        assert filecmp.cmp(OUT_FILE, EXPECTED_FILE)
diff --git a/test/generate-inputs/expected/capdsd-ppi-expected.txt b/test/generate-inputs/expected/capdsd-ppi-expected.txt
new file mode 100644
index 000000000..8334ffd53
--- /dev/null
+++ b/test/generate-inputs/expected/capdsd-ppi-expected.txt
@@ -0,0 +1,2 @@
+test_A B 0.98
+B C 0.77
diff --git a/test/generate-inputs/test_generate_inputs.py b/test/generate-inputs/test_generate_inputs.py
index 6d732d315..0726e919c 100644
--- a/test/generate-inputs/test_generate_inputs.py
+++ b/test/generate-inputs/test_generate_inputs.py
@@ -16,7 +16,8 @@
     'omicsintegrator2': 'edges',
     'domino': 'network',
     'pathlinker': 'network',
-    'allpairs': 'network'
+    'allpairs': 'network',
+    'capdsd': 'ppi'
 }
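
Usage note: once the capdsd image is available locally, the new tests can be run directly with pytest, e.g. pytest test/capDSD/test_capDSD.py.
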
diff --git a/test/parse-outputs/test_parse_outputs.py b/test/parse-outputs/test_parse_outputs.py
index 49baf10f8..309d2920c 100644
--- a/test/parse-outputs/test_parse_outputs.py
+++ b/test/parse-outputs/test_parse_outputs.py
@@ -12,7 +12,7 @@
 # the DOMINO output of the network dip.sif and the nodes tnfa_active_genes_file.txt
 # from https://github.com/Shamir-Lab/DOMINO/tree/master/examples
 
-algorithms = ['mincostflow', 'meo', 'omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'allpairs', 'domino']
+algorithms = ['mincostflow', 'meo', 'omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'allpairs', 'domino', 'capdsd']
 
 class TestParseOutputs:
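
With the include: true entry in config/config.yaml above, capdsd is then picked up by a standard SPRAS run, e.g. snakemake --cores 1 --configfile config/config.yaml.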