diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 689d69e..55a4046 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,7 @@ ## Helping Out -There are `TODOs` that better enhance the reproducability of datasets or analysis of algorithm outputs, as well as +There are `TODOs` that better enhance the reproducability and accuracy of datasets or analysis of algorithm outputs, as well as [open resolvable issues](https://github.com/Reed-CompBio/spras-benchmarking/). ## Adding a dataset diff --git a/cache/.gitignore b/cache/.gitignore new file mode 100644 index 0000000..de153db --- /dev/null +++ b/cache/.gitignore @@ -0,0 +1 @@ +artifacts diff --git a/cache/README.md b/cache/README.md new file mode 100644 index 0000000..d997004 --- /dev/null +++ b/cache/README.md @@ -0,0 +1,3 @@ +# cache + +Handles artifact fetching and cache. This folder has a `Snakefile` which only contains a single function used for producing fetching rules. diff --git a/cache/Snakefile b/cache/Snakefile new file mode 100644 index 0000000..2d22236 --- /dev/null +++ b/cache/Snakefile @@ -0,0 +1,34 @@ +from cache import link +from cache.util import uncompress +import urllib.parse +from dataclasses import dataclass +from typing import Union +from pathlib import Path + +@dataclass +class FetchConfig: + directive: list[str] + uncompress: bool = False + +def produce_fetch_rules(input_dict: dict[str, Union[FetchConfig, list[str]]]): + """ + Produces fetch rules based on a dictionary mapping + output files to their directory.py-based directive. + """ + # Map inputs to be wrapped with FetchConfig if list[str] + input_dict = {k: FetchConfig(v) if isinstance(v, list) else v for k, v in input_dict.items()} + + directives = [urllib.parse.quote_plus("/".join(directive.directive)) for directive in input_dict.values()] + assert len(directives) == len(set(directives)), "Directives aren't unique!" + + for output_file, config in input_dict.items(): + # Since placeholders are evaluated when the job is actually ran, + # we pass data using params and output. + rule: + name: f"fetch_{urllib.parse.quote_plus("/".join(config.directive))}_to_{urllib.parse.quote_plus(output_file)}" + output: file=output_file + params: + config=config + run: + Path(output.file).parent.mkdir(exist_ok=True) + link(Path(output.file), params.config.directive, uncompress=params.config.uncompress) diff --git a/cache/__init__.py b/cache/__init__.py index e69de29..2f15fe4 100644 --- a/cache/__init__.py +++ b/cache/__init__.py @@ -0,0 +1,82 @@ +""" +This is how spras-benchmarking handles artifact caching. `cache` should be used specifically inside `Snakefile` +""" + +from cache.util import uncompress as uncompress_file +from cache.directory import get_cache_item +from pathlib import Path +import os +from urllib.parse import quote_plus +import pickle + +__all__ = ["link"] + +dir_path = Path(os.path.dirname(os.path.realpath(__file__))) +artifacts_dir = dir_path / "artifacts" + +def get_artifact_name(directive: list[str]) -> str: + return quote_plus("/".join(directive)) + +def has_expired(directive: list[str]) -> bool: + """ + Check if the artifact metadata associated with a directive has expired. + Avoids re-downloading the artifact if nothing has changed. 
+ """ + artifact_name = get_artifact_name(directive) + cache_item = get_cache_item(directive) + + metadata_dir = artifacts_dir / 'metadata' + metadata_dir.mkdir(exist_ok=True) + metadata_file = (artifacts_dir / 'metadata' / artifact_name).with_suffix((artifacts_dir / artifact_name).suffix + '.metadata') + + # metadata never existed: we need to retrieve the new file + if not metadata_file.exists(): + with open(metadata_file, 'wb') as f: + pickle.dump(cache_item, f) + return True + + old_cache_item = None + with open(metadata_file, 'rb') as f: + old_cache_item = pickle.load(f) + + # metadata expired: re-retrieve the item + if old_cache_item != cache_item: + with open(metadata_file, 'wb') as f: + pickle.dump(cache_item, f) + return True + + # metadata hasn't changed and already existed: this hasn't expired + return False + +def link(output: str, directive: list[str], uncompress=False): + """ + Links output files from cache.directory directives. + For example, + + ```py + link("output/ensg-ensp.tsv", ["BioMart", "ensg-ensp.tsv"]) + ``` + + would download and check BioMart's cache for ENSG-ENSP mapping, then symlink the cached output + (lying somewhere in the cache folder) with the desired `output`. + """ + + artifacts_dir.mkdir(exist_ok=True) + + artifact_name = get_artifact_name(directive) + + Path(output).unlink(missing_ok=True) + + # Re-download if the directive has expired. + cache_item = get_cache_item(directive) + if has_expired(directive): + (artifacts_dir / artifact_name).unlink(missing_ok=True) + cache_item.download(artifacts_dir / artifact_name) + + if uncompress: + uncompressed_artifact_path = Path(str(artifacts_dir / artifact_name) + '.uncompressed') + uncompressed_artifact_path.unlink(missing_ok=True) + uncompress_file(artifacts_dir / artifact_name, uncompressed_artifact_path) + Path(output).symlink_to(uncompressed_artifact_path) + else: + Path(output).symlink_to(artifacts_dir / artifact_name) diff --git a/cache/biomart/README.md b/cache/biomart/README.md index fbc9dde..d5d85c5 100644 --- a/cache/biomart/README.md +++ b/cache/biomart/README.md @@ -1,3 +1,4 @@ # BioMart XML Queries Directory for storing XML queries generated from [the BioMart interface](https://www.ensembl.org/info/data/biomart/index.html). +See the martview: https://www.ensembl.org/biomart/martview. 
diff --git a/cache/directory.py b/cache/directory.py index c0adeec..5df6f1b 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -12,6 +12,7 @@ dir_path = Path(os.path.dirname(os.path.realpath(__file__))) + def fetch_biomart_url(xml: str) -> str: """ Access BioMart data through the BioMart REST API: @@ -20,16 +21,31 @@ def fetch_biomart_url(xml: str) -> str: ROOT = "http://www.ensembl.org/biomart/martservice?query=" return ROOT + urllib.parse.quote_plus(xml) + @dataclass class CacheItem: """Class for differentriating between offline and online items in a cache.""" + name: str + """The display name of the artifact, used for human-printing.""" cached: str online: str + @classmethod + def cache_only(cls, name: str, cached: str) -> "CacheItem": + """Wrapper method to explicitly declare a CacheItem as cached only.""" + return cls(name=name, online=cached, cached="") + def download(self, output: str | PathLike): + print(f"Fetching {self.name}...") print(f"Downloading {self.online}...") + if self.cached == "": + # From CacheItem.cached_only + # (gdown doesn't take in Paths for the output_file, so we must stringify it here) + gdown.download(self.online, str(output)) + return + urllib.request.urlretrieve(self.online, output) with NamedTemporaryFile() as cached_file: @@ -45,14 +61,16 @@ def download(self, output: str | PathLike): directory: CacheDirectory = { "STRING": { "9606": { - "links": CacheItem( + "9606.protein.links.txt.gz": CacheItem( + name="STRING 9606 protein links", cached="https://drive.google.com/uc?id=1fvjdIbgzbgJrdJxWRRRwwS1zuegf6DOj", online="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz", ), - "aliases": CacheItem( + "9606.protein.aliases.txt.gz": CacheItem( + name="STRING 9606 protein aliases", cached="https://drive.google.com/uc?id=1IWrQeTVCcw1A-jDk-4YiReWLnwP0S9bY", online="https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz", - ) + ), } }, "UniProt": { @@ -60,67 +78,80 @@ def download(self, output: str | PathLike): "9606": { # We prefer manually curated genes. 
"SwissProt_9606.tsv": CacheItem( + name="UniProt 9606 SwissProt genes", cached="https://drive.google.com/uc?id=1h2Cl-60qcKse-djcsqlRXm_n60mVY7lk", - online="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29" + online="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29", ), "HUMAN_9606_idmapping_selected.tab.gz": CacheItem( + name="UniProt 9606 ID external database mapping", cached="https://drive.google.com/uc?id=1Oysa5COq31H771rVeyrs-6KFhE3VJqoX", - online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz" + online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz", ), "HUMAN_9606_idmapping.dat.gz": CacheItem( + name="UniProt 9606 internal id mapping", cached="https://drive.google.com/uc?id=1lGxrx_kGyNdupwIOUXzfIZScc7rQKP-O", - online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz" - ) + online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz", + ), } }, "DISEASES": { # Instead of going through https://unmtid-shinyapps.net/shiny/tiga/, we use their # archived files directory instead. "tiga_gene-trait_stats.tsv": CacheItem( + name="TIGA data", cached="https://drive.google.com/uc?id=114qyuNDy4qdmYDHHJAW-yBeTxcGTDUnK", online="https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv", ), "HumanDO.tsv": CacheItem( + name="Disease ontology data", cached="https://drive.google.com/uc?id=1lfB1DGJgrXTxP_50L6gGu_Nq6OyDjiIi", online="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/016a4ec33d1a1508d669650086cd92ccebe138e6/DOreports/HumanDO.tsv", ), "human_disease_textmining_filtered.tsv": CacheItem( + name="DISEASES textmining channel", cached="https://drive.google.com/uc?id=1vD8KbT9sk04VEJx9r3_LglCTGYJdhN0D", online="https://download.jensenlab.org/human_disease_textmining_filtered.tsv", ), "human_disease_knowledge_filtered.tsv": CacheItem( + name="DISEASES knowledge channel", cached="https://drive.google.com/uc?id=1qGUnjVwF9-8p5xvp8_6CfVsbMSM_wkld", online="https://download.jensenlab.org/human_disease_knowledge_filtered.tsv", ), }, "BioMart": { "ensg-ensp.tsv": CacheItem( + name="BioMart ENSG <-> ENSP mapping", cached="https://drive.google.com/uc?id=1-gPrDoluXIGydzWKjWEnW-nWhYu3YkHL", - online=fetch_biomart_url((dir_path / "biomart" / "ensg-ensp.xml").read_text()) + online=fetch_biomart_url((dir_path / "biomart" / "ensg-ensp.xml").read_text()), ) }, "DepMap": { "OmicsProfiles.csv": CacheItem( + name="DepMap omics metadata", cached="https://drive.google.com/uc?id=1i54aKfO0Ci2QKLTNJnuQ_jgGhH4c9rTL", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads" + 
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads", ), "CRISPRGeneDependency.csv": CacheItem( + name="DepMap gene dependency probability estimates", cached="https://drive.google.com/uc?id=122rWNqT_u3M7B_11WYZMtOLiPbBykkaz", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads" + online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads", ), "OmicsSomaticMutationsMatrixDamaging.csv": CacheItem( + name="DepMap genotyped matrix", cached="https://drive.google.com/uc?id=1W7N2H0Qi7NwmTmNChcwa2ZZ4WxAuz-Xh", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads" + online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads", ), "OmicsExpressionProteinCodingGenesTPMLogp1.csv": CacheItem( + name="DepMap model-level TPMs", cached="https://drive.google.com/uc?id=1P0m88eXJ8GPdru8h9oOcHPeXKU7ljIrP", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads" + online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads", ), "OmicsCNGeneWGS.csv": CacheItem( + name="DepMap gene-level copy number data", cached="https://drive.google.com/uc?id=1TPp3cfK7OZUrftucr3fLO-krXSQAA6Ub", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads" - ) - } + online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads", + ), + }, } diff --git a/cache/index.py b/cache/index.py deleted file mode 100644 index 385a2ff..0000000 --- a/cache/index.py +++ /dev/null @@ -1 +0,0 @@ -# Artifact caching diff --git a/databases/util.py b/cache/util.py similarity index 99% rename from databases/util.py rename to cache/util.py index 8ffb500..da17d2b 100644 --- a/databases/util.py +++ b/cache/util.py @@ -2,6 +2,7 @@ import gzip import shutil + def uncompress(source: Path, target: Path): """Uncompresses a .gz file""" # Uncompressing a .gz file: https://stackoverflow.com/a/44712152/7589775 diff --git a/databases/README.md b/databases/README.md deleted file mode 100644 index a87fa4a..0000000 --- a/databases/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# databases - -A catalog of CLIs wrapping various common background PPI databases. 
diff --git a/databases/__init__.py b/databases/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/databases/stringdb.py b/databases/stringdb.py deleted file mode 100644 index 3c6717a..0000000 --- a/databases/stringdb.py +++ /dev/null @@ -1,49 +0,0 @@ -import argparse -import os -from pathlib import Path -from databases.util import uncompress - -from cache.directory import get_cache_item - -# https://stackoverflow.com/a/5137509/7589775 -dir_path = os.path.dirname(os.path.realpath(__file__)) - -string_path = Path(dir_path, "string") - - -def parse_args(): - parser = argparse.ArgumentParser( - prog="STRING DB Fetcher", description="Downloads specified STRING DB background interactomes from a specific organism." - ) - - parser.add_argument( - "-i", - "--id", - help=""" - The specified organism ID to use. - See https://string-db.org/cgi/download for more info. - For example, 9606 is the homo sapiens background interactome. - For an example usage, see datasets/diseases's Snakefile. - """, - type=int, - required=True, - ) - - return parser.parse_args() - -def main(): - args = parse_args() - string_path.mkdir(exist_ok=True) - - # We download the links file - links_file = string_path / f"{args.id}.protein.links.v12.0.txt.gz" - get_cache_item(["STRING", str(args.id), "links"]).download(links_file) - uncompress(links_file, links_file.with_suffix("")) # an extra call of with_suffix strips the `.gz` prefix - - # and its associated aliases - aliases_file = string_path / f"{args.id}.protein.aliases.v12.0.txt.gz" - get_cache_item(["STRING", str(args.id), "aliases"]).download(aliases_file) - uncompress(aliases_file, aliases_file.with_suffix("")) - -if __name__ == "__main__": - main() diff --git a/datasets/depmap/Snakefile b/datasets/depmap/Snakefile index 2182663..b98db0d 100644 --- a/datasets/depmap/Snakefile +++ b/datasets/depmap/Snakefile @@ -1,3 +1,5 @@ +include: "../../cache/Snakefile" + rule all: # We currently only care about the FADU cell line. 
input: @@ -5,18 +7,16 @@ rule all: "processed/FADU_cell_line_prizes.txt", "processed/FADU_gold_standard_thresh_0_5.txt" -rule fetch: - output: - "raw/CRISPRGeneDependency.csv", - "raw/OmicsProfiles.csv", - "raw/OmicsSomaticMutationsMatrixDamaging.csv", - "raw/OmicsExpressionProteinCodingGenesTPMLogp1.csv", - "raw/OmicsCNGeneWGS.csv", - "raw/HUMAN_9606_idmapping.tsv", - "raw/HUMAN_9606_idmapping_selected.tsv", - "raw/SwissProt_9606.tsv" - shell: - "uv run scripts/fetch.py" +produce_fetch_rules({ + "raw/CRISPRGeneDependency.csv": ["DepMap", "CRISPRGeneDependency.csv"], + "raw/OmicsProfiles.csv": ["DepMap", "OmicsProfiles.csv"], + "raw/OmicsSomaticMutationsMatrixDamaging.csv": ["DepMap", "OmicsSomaticMutationsMatrixDamaging.csv"], + "raw/OmicsExpressionProteinCodingGenesTPMLogp1.csv": ["DepMap", "OmicsExpressionProteinCodingGenesTPMLogp1.csv"], + "raw/OmicsCNGeneWGS.csv": ["DepMap", "OmicsCNGeneWGS.csv"], + "raw/HUMAN_9606_idmapping.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping.dat.gz"], uncompress=True), + "raw/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True), + "raw/SwissProt_9606.tsv": ["UniProt", "9606", "SwissProt_9606.tsv"], +}) rule mapping: input: diff --git a/datasets/depmap/scripts/cell_line_processing.py b/datasets/depmap/scripts/cell_line_processing.py index 0fad7d9..7ab5dbe 100644 --- a/datasets/depmap/scripts/cell_line_processing.py +++ b/datasets/depmap/scripts/cell_line_processing.py @@ -131,6 +131,7 @@ def process_single_cell_line( print(f"Processing for cell line '{cell_line_name}' completed successfully.") return True + def generate_gold_standard(cell_line_name, model_id, CRISPR_dependency, gene_to_uniprot, threshold: float): """Generate gold standard file for the cell line based on CRISPR dependency and gene to Uniprot mapping.""" # map Uniprot IDs to gene symbols in the CRISPR dependency data diff --git a/datasets/depmap/scripts/fetch.py b/datasets/depmap/scripts/fetch.py deleted file mode 100644 index b922312..0000000 --- a/datasets/depmap/scripts/fetch.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Fetches the latest DepMap data we need - -Download page: https://depmap.org/portal/data_page/?tab=allData -""" - -from pathlib import Path -import os -from cache.directory import get_cache_item -from databases.util import uncompress - -# https://stackoverflow.com/a/5137509/7589775 -dir_path = os.path.dirname(os.path.realpath(__file__)) - -raw_dir = Path(dir_path, "..", "raw") - - -def main(): - raw_dir.mkdir(exist_ok=True) - - print("Fetching DepMap omics metadata") - get_cache_item(["DepMap", "OmicsProfiles.csv"]).download(raw_dir / "OmicsProfiles.csv") - - print("Fetching DepMap gene dependency probability estimates...") - get_cache_item(["DepMap", "CRISPRGeneDependency.csv"]).download(raw_dir / "CRISPRGeneDependency.csv") - - print("Fetching DepMap genotyped matrix...") - get_cache_item(["DepMap", "OmicsSomaticMutationsMatrixDamaging.csv"]).download(raw_dir / "OmicsSomaticMutationsMatrixDamaging.csv") - - print("Fetching DepMap model-level TPMs...") - get_cache_item(["DepMap", "OmicsExpressionProteinCodingGenesTPMLogp1.csv"]).download(raw_dir / "OmicsExpressionProteinCodingGenesTPMLogp1.csv") - - print("Fetching DepMap gene-level copy number data...") - get_cache_item(["DepMap", "OmicsCNGeneWGS.csv"]).download(raw_dir / "OmicsCNGeneWGS.csv") - - print("Fetching UniProt internal id mapping...") - get_cache_item(["UniProt", "9606", "HUMAN_9606_idmapping.dat.gz"]).download(raw_dir / 
"HUMAN_9606_idmapping.dat.gz") - uncompress(raw_dir / "HUMAN_9606_idmapping.dat.gz", raw_dir / "HUMAN_9606_idmapping.tsv") - - print("Fetching UniProt id external database mapping...") - get_cache_item(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"]).download(raw_dir / "HUMAN_9606_idmapping_selected.tab.gz") - uncompress(raw_dir / "HUMAN_9606_idmapping_selected.tab.gz", raw_dir / "HUMAN_9606_idmapping_selected.tsv") - - print("Fetching UniProt SwissProt genes...") - get_cache_item(["UniProt", "9606", "SwissProt_9606.tsv"]).download(raw_dir / "SwissProt_9606.tsv") - -if __name__ == "__main__": - main() diff --git a/datasets/depmap/scripts/uniprot_mapping.py b/datasets/depmap/scripts/uniprot_mapping.py index 4198366..71fc1cb 100644 --- a/datasets/depmap/scripts/uniprot_mapping.py +++ b/datasets/depmap/scripts/uniprot_mapping.py @@ -15,10 +15,12 @@ def extract_gene_symbols(input_df: pd.DataFrame) -> pd.DataFrame: gene_columns = input_df.columns.tolist()[1:] gene_symbols = [ # We want to extract GENE_NAME from GENE_NAME (Unknown) - (col[:col.find("(") - 1], None) if "(Unknown)" in col else + (col[: col.find("(") - 1], None) + if "(Unknown)" in col # or GENE_ID from "GENE_NAME (GENE_ID)" - (col[:col.find("(") - 1], col[col.find("(") + 1:-1]) if "(" in col else - (col, None) + else (col[: col.find("(") - 1], col[col.find("(") + 1 : -1]) + if "(" in col + else (col, None) for col in gene_columns ] @@ -50,19 +52,22 @@ def main(): # while idmapping will be used for GeneSymbol -> UniProtKB-AC mapping. # We'll also take the idmapping data and trim for specifically Swiss-Prot (curated) genes. - curated_df = pd.read_csv(dir_path / ".." / "raw" / "SwissProt_9606.tsv", sep='\t', usecols=["Entry", "Entry Name", "Gene Names"]) + curated_df = pd.read_csv(dir_path / ".." / "raw" / "SwissProt_9606.tsv", sep="\t", usecols=["Entry", "Entry Name", "Gene Names"]) curated_df.columns = ["UniProtKB-AC", "Entry Name", "Gene Names"] idmapping_df = pd.read_csv( - dir_path / ".." / "raw" / "HUMAN_9606_idmapping.tsv", - header=None, names=["UniProtKB-AC", "ID_type", "Value"], sep='\t') + dir_path / ".." / "raw" / "HUMAN_9606_idmapping.tsv", header=None, names=["UniProtKB-AC", "ID_type", "Value"], sep="\t" + ) idmapping_df = idmapping_df[idmapping_df["ID_type"] == "Gene_Name"].drop(columns=["ID_type"]).rename(columns={"Value": "GeneSymbol"}) idmapping_df = idmapping_df.merge(curated_df, on="UniProtKB-AC", how="inner") gene_symbols_df_nid = gene_symbols_df_nid.merge(idmapping_df, on="GeneSymbol", how="inner").drop(columns=["GeneID"]) idmapping_selected_df = pd.read_csv( dir_path / ".." / "raw" / "HUMAN_9606_idmapping_selected.tsv", - header=None, usecols=[0, 1, 2], names=["UniProtKB-AC", "UniProtKB-ID", "GeneID"], sep='\t' + header=None, + usecols=[0, 1, 2], + names=["UniProtKB-AC", "UniProtKB-ID", "GeneID"], + sep="\t", ) idmapping_selected_df = idmapping_selected_df[~idmapping_selected_df["GeneID"].isna()] idmapping_selected_df = idmapping_selected_df.merge(curated_df, on="UniProtKB-AC", how="inner") @@ -72,7 +77,7 @@ def main(): gene_symbol_df = gene_symbols_df_id.merge(gene_symbols_df_nid, on=["GeneSymbol", "UniProtKB-AC", "Entry Name", "Gene Names"], how="outer") gene_symbol_df = gene_symbol_df.drop(columns=["Gene Names"]) gene_symbol_df = gene_symbol_df.rename(columns={"GeneSymbol": "From"}) - gene_symbol_df.to_csv(dir_path / ".." / "processed" / "DamagingMutations_idMapping.tsv", sep='\t', index=False) + gene_symbol_df.to_csv(dir_path / ".." 
/ "processed" / "DamagingMutations_idMapping.tsv", sep="\t", index=False) if __name__ == "__main__": diff --git a/datasets/diseases/Snakefile b/datasets/diseases/Snakefile index fcaa100..0455b57 100644 --- a/datasets/diseases/Snakefile +++ b/datasets/diseases/Snakefile @@ -1,3 +1,5 @@ +include: "../../cache/Snakefile" + rule all: input: "GS_files/Alopecia_areata_GS.txt", @@ -5,27 +7,21 @@ rule all: "prize_files/alopecia_areata_prizes.txt", "prize_files/diabetes_mellitus_prizes.txt" -rule of_db: - output: - "../../databases/string/9606.protein.links.v12.0.txt", - "../../databases/string/9606.protein.aliases.v12.0.txt" - shell: - "uv run ../../databases/stringdb.py --id 9606" - -rule fetch: - output: - "raw/human_disease_knowledge_filtered.tsv", - "raw/human_disease_textmining_filtered.tsv", - "raw/HumanDO.tsv", - "raw/tiga_gene-trait_stats.tsv" - shell: - "uv run scripts/fetch.py" +produce_fetch_rules({ + "raw/human_disease_textmining_filtered.tsv": ["DISEASES", "human_disease_textmining_filtered.tsv"], + "raw/human_disease_knowledge_filtered.tsv": ["DISEASES", "human_disease_knowledge_filtered.tsv"], + "raw/HumanDO.tsv": ["DISEASES", "HumanDO.tsv"], + "raw/tiga_gene-trait_stats.tsv": ["DISEASES", "tiga_gene-trait_stats.tsv"], + "raw/ensg-ensp.tsv": ["BioMart", "ensg-ensp.tsv"], + "raw/9606.protein.links.txt": FetchConfig(["STRING", "9606", "9606.protein.links.txt.gz"], uncompress=True), + "raw/9606.protein.aliases.txt": FetchConfig(["STRING", "9606", "9606.protein.aliases.txt.gz"], uncompress=True), +}) rule inputs: input: "raw/HumanDO.tsv", "raw/tiga_gene-trait_stats.tsv", - "../../databases/string/9606.protein.aliases.v12.0.txt" + "raw/9606.protein.aliases.txt" output: "data/inputs.csv" shell: @@ -35,7 +31,8 @@ rule gold_standard: input: "raw/human_disease_knowledge_filtered.tsv", "raw/human_disease_textmining_filtered.tsv", - "../../databases/string/9606.protein.aliases.v12.0.txt" + "raw/9606.protein.aliases.txt", + "raw/ensg-ensp.tsv" output: "data/gold_standard.csv" shell: @@ -45,7 +42,7 @@ rule files: input: "data/inputs.csv", "data/gold_standard.csv", - "../../databases/string/9606.protein.links.v12.0.txt" + "raw/9606.protein.links.txt" output: # These are the two we use for the SPRAS run for now "GS_files/Alopecia_areata_GS.txt", diff --git a/datasets/diseases/scripts/fetch.py b/datasets/diseases/scripts/fetch.py deleted file mode 100644 index 8daf58d..0000000 --- a/datasets/diseases/scripts/fetch.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Fetches the latest DISEASES database channels, TIGA data, and human disease ontology data that we need. 
- -Download pages: -- DISEASES: https://diseases.jensenlab.org/Downloads -- TIGA: https://unmtid-shinyapps.net/shiny/tiga/ -- Disease Ontology: https://disease-ontology.org/downloads/ -""" - -from pathlib import Path -import os -from cache.directory import get_cache_item - -# https://stackoverflow.com/a/5137509/7589775 -dir_path = os.path.dirname(os.path.realpath(__file__)) - -raw_dir = Path(dir_path, "..", "raw") - - -def main(): - # We only need the text mining and knowledge channels - # and avoid the integrated channel as it is the multiplied probabilities of all - # three channels (personal correspondence with Damian Szklarczyk) - - raw_dir.mkdir(exist_ok=True) - - print("Fetching DISEASES text channel...") - get_cache_item(["DISEASES", "human_disease_textmining_filtered.tsv"]).download(raw_dir / "human_disease_textmining_filtered.tsv") - - print("Fetching DISEASES knowledge channel...") - get_cache_item(["DISEASES", "human_disease_knowledge_filtered.tsv"]).download(raw_dir / "human_disease_knowledge_filtered.tsv") - - print("Fetching TIGA data...") - get_cache_item(["DISEASES", "tiga_gene-trait_stats.tsv"]).download(raw_dir / "tiga_gene-trait_stats.tsv") - - print("Fetching human disease ontology data...") - get_cache_item(["DISEASES", "HumanDO.tsv"]).download(raw_dir / "HumanDO.tsv") - - print("Fetching BioMart ENSG - ENSP mapping...") - get_cache_item(["BioMart", "ensg-ensp.tsv"]).download(raw_dir / "ensg-ensp.tsv") - - -if __name__ == "__main__": - main() diff --git a/datasets/diseases/scripts/files.py b/datasets/diseases/scripts/files.py index 2897c94..dc5a949 100644 --- a/datasets/diseases/scripts/files.py +++ b/datasets/diseases/scripts/files.py @@ -40,9 +40,9 @@ def main(): df = df[["str_id"]] df.to_csv(diseases_path / "GS_files" / f"{disease.replace(' ', '_')}_GS.txt", sep="\t", index=False, header=None) - # See /databases/stringdb.py for information on how this was grabbed. + # See /cache/directory.py for information on how this was grabbed. # 9606 is the organism code for homo sapiens and the required background interactome of DISEASES. - string = pd.read_csv(diseases_path / ".." / ".." / "databases" / "string" / "9606.protein.links.v12.0.txt", sep=" ", skiprows=[0], header=None) + string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.txt", sep=" ", skiprows=[0], header=None) # Threshold anything above a confidence score of 900 to trim down the background interactome string = string[string.iloc[:, 2] > 900] diff --git a/datasets/diseases/scripts/gold_standard.py b/datasets/diseases/scripts/gold_standard.py index 1d3ee5f..a60b20c 100644 --- a/datasets/diseases/scripts/gold_standard.py +++ b/datasets/diseases/scripts/gold_standard.py @@ -65,9 +65,7 @@ def main(): # NOTE: the STRING API call to map genes to proteins # also does text search, which brings up more false positives than true positives: because # of this, we specifically only care about ENSG -> ENSP and nothing greater. - string_aliases = pd.read_csv( - diseases_path / ".." / ".." 
/ "databases" / "string" / "9606.protein.aliases.v12.0.txt", sep="\t", usecols=["#string_protein_id", "alias"] - ) + string_aliases = pd.read_csv(diseases_path / "raw" / "9606.protein.aliases.txt", sep="\t", usecols=["#string_protein_id", "alias"]) string_aliases.columns = ["str_id", "ENSP"] string_aliases = string_aliases.drop_duplicates() diff --git a/datasets/diseases/scripts/inputs.py b/datasets/diseases/scripts/inputs.py index 18608f2..ba35396 100644 --- a/datasets/diseases/scripts/inputs.py +++ b/datasets/diseases/scripts/inputs.py @@ -24,9 +24,7 @@ def main(): # Mapping ENSG IDs to STRING IDs through the STRING aliases file # given our ENSG and ENSP (non one-to-one!) mapping `string_aliases`, - string_aliases = pd.read_csv( - diseases_path / ".." / ".." / "databases" / "string" / "9606.protein.aliases.v12.0.txt", sep="\t", usecols=["#string_protein_id", "alias"] - ) + string_aliases = pd.read_csv(diseases_path / "raw" / "9606.protein.aliases.txt", sep="\t", usecols=["#string_protein_id", "alias"]) string_aliases.columns = ["str_id", "ENSP"] string_aliases = string_aliases.drop_duplicates() diff --git a/datasets/hiv/Scripts/Data_Prep.py b/datasets/hiv/Scripts/Data_Prep.py index 1aca432..43460cf 100644 --- a/datasets/hiv/Scripts/Data_Prep.py +++ b/datasets/hiv/Scripts/Data_Prep.py @@ -2,30 +2,24 @@ import pickle import os -prize_05 = pandas.read_csv('raw/prize_05.csv', sep='\t', lineterminator='\n') -prize_060 = pandas.read_csv('raw/prize_060.csv', sep='\t', lineterminator='\n') +prize_05 = pandas.read_csv("raw/prize_05.csv", sep="\t", lineterminator="\n") +prize_060 = pandas.read_csv("raw/prize_060.csv", sep="\t", lineterminator="\n") -prize_05['Uniprot'] = prize_05['Uniprot'].str.split('-', expand=False).str[0] -prize_05 = prize_05.sort_values('Prize', - ascending=False).drop_duplicates('Uniprot').sort_index() +prize_05["Uniprot"] = prize_05["Uniprot"].str.split("-", expand=False).str[0] +prize_05 = prize_05.sort_values("Prize", ascending=False).drop_duplicates("Uniprot").sort_index() -prize_060['Uniprot'] = prize_060['Uniprot'].str.split('-', expand=False).str[0] -prize_060 = prize_060.sort_values('Prize', - ascending=False).drop_duplicates('Uniprot').sort_index() +prize_060["Uniprot"] = prize_060["Uniprot"].str.split("-", expand=False).str[0] +prize_060 = prize_060.sort_values("Prize", ascending=False).drop_duplicates("Uniprot").sort_index() -prize_060_nodes = prize_060['Uniprot'].tolist() -prize_05_nodes = prize_05['Uniprot'].tolist() +prize_060_nodes = prize_060["Uniprot"].tolist() +prize_05_nodes = prize_05["Uniprot"].tolist() -nodeset = list(set(prize_05_nodes+prize_060_nodes)) +nodeset = list(set(prize_05_nodes + prize_060_nodes)) -df = { - "NodeIDs": nodeset, - "prize_05": prize_05, - "prize_060": prize_060 -} +df = {"NodeIDs": nodeset, "prize_05": prize_05, "prize_060": prize_060} -if not os.path.exists('./Pickles'): - os.makedirs('./Pickles') +if not os.path.exists("./Pickles"): + os.makedirs("./Pickles") -with open("Pickles/NodeIDs.pkl","wb") as file: - pickle.dump(df,file) +with open("Pickles/NodeIDs.pkl", "wb") as file: + pickle.dump(df, file) diff --git a/datasets/hiv/Scripts/Kegg_Orthology.py b/datasets/hiv/Scripts/Kegg_Orthology.py index 301e340..919e442 100644 --- a/datasets/hiv/Scripts/Kegg_Orthology.py +++ b/datasets/hiv/Scripts/Kegg_Orthology.py @@ -3,62 +3,59 @@ import pandas as pd from more_itertools import chunked -pathway = read(open("Raw_Data/ko03250.xml", 'r')) +pathway = read(open("Raw_Data/ko03250.xml", "r")) -#Read in Kegg pathway data and keep only 
orthologs +# Read in Kegg pathway data and keep only orthologs entries_data = [] for entry in pathway.entries.values(): - if entry.type == 'ortholog': - entries_data.append({ - 'name': entry.name - }) + if entry.type == "ortholog": + entries_data.append({"name": entry.name}) entries_df = pd.DataFrame(entries_data) -#Some orthologs have multiple ko codes in the same row -#The following two lines move all ko codes to individual rows -orthology_ids = entries_df['name'].str.split(' ').explode() -orthology_ids = orthology_ids.apply(lambda x: x.split(':')[1]).tolist() +# Some orthologs have multiple ko codes in the same row +# The following two lines move all ko codes to individual rows +orthology_ids = entries_df["name"].str.split(" ").explode() +orthology_ids = orthology_ids.apply(lambda x: x.split(":")[1]).tolist() -#Using bioservices KEGG class to map ortholog(ko) codes to human(hsa) codes +# Using bioservices KEGG class to map ortholog(ko) codes to human(hsa) codes k = KEGG() -ko_hsa_map = k.link('hsa', '+'.join(orthology_ids)) -ko_hsa_dict = {x.split('\t')[0].split(':')[1]: x.split('\t')[1] for x in ko_hsa_map.split('\n')[:-1]} -ko_hsa_df = pd.DataFrame(ko_hsa_dict.items(),columns= ['KEGG_Orthology','HSA']) - -#Kegg .get is limited to 10 entries per call -#The following code chunks the hsa list into sets of 10 -#then calls the .get function on each which returns kegg api data in string format -hsa_chunked = list(chunked(ko_hsa_df['HSA'].tolist(),10)) +ko_hsa_map = k.link("hsa", "+".join(orthology_ids)) +ko_hsa_dict = {x.split("\t")[0].split(":")[1]: x.split("\t")[1] for x in ko_hsa_map.split("\n")[:-1]} +ko_hsa_df = pd.DataFrame(ko_hsa_dict.items(), columns=["KEGG_Orthology", "HSA"]) + +# Kegg .get is limited to 10 entries per call +# The following code chunks the hsa list into sets of 10 +# then calls the .get function on each which returns kegg api data in string format +hsa_chunked = list(chunked(ko_hsa_df["HSA"].tolist(), 10)) raw_uniprot = [] for entry in hsa_chunked: - raw_uniprot.append(k.get('+'.join(entry)).split('\n///\n\n')) + raw_uniprot.append(k.get("+".join(entry)).split("\n///\n\n")) -#Raw Kegg api data is filtered to obtain hsa and uniprot codes for each protein -#Note: Although bioservices .link and .conv return cleaner outputs, they do not support -#one to many relationships at this time. -#Note: bioservices also supplies a parser method for the kegg api but it is also broken at this time. +# Raw Kegg api data is filtered to obtain hsa and uniprot codes for each protein +# Note: Although bioservices .link and .conv return cleaner outputs, they do not support +# one to many relationships at this time. +# Note: bioservices also supplies a parser method for the kegg api but it is also broken at this time. 
processed_uniprot = [] for chunk in raw_uniprot: for item in chunk: - item = item.split('\n') - processed_uniprot.append([(x.strip().split(' ')[1:],'hsa:'+(item[0].split(' '*7)[1])) - for x in item if 'UniProt' in x][0]) + item = item.split("\n") + processed_uniprot.append([(x.strip().split(" ")[1:], "hsa:" + (item[0].split(" " * 7)[1])) for x in item if "UniProt" in x][0]) -#Creates a dictionary where uniprot ids are keys and hsa ids are values +# Creates a dictionary where uniprot ids are keys and hsa ids are values hsa_uniprot_dict = {} -for item in processed_uniprot : - for entry in item[0]: - hsa_uniprot_dict.update({'up:'+entry:item[1]}) +for item in processed_uniprot: + for entry in item[0]: + hsa_uniprot_dict.update({"up:" + entry: item[1]}) -#Creates a dataframe with uniprot and hsa values then merges with ko-hsa dataframe by hsa +# Creates a dataframe with uniprot and hsa values then merges with ko-hsa dataframe by hsa hsa_uniprot_map = pd.DataFrame.from_dict(hsa_uniprot_dict.items()) -hsa_uniprot_map.columns = ['Uniprot','HSA'] -final_df = ko_hsa_df.merge(hsa_uniprot_map,on = 'HSA') -uniprotIDs = final_df['Uniprot'].apply(lambda x: x.split(':')[1]).tolist() +hsa_uniprot_map.columns = ["Uniprot", "HSA"] +final_df = ko_hsa_df.merge(hsa_uniprot_map, on="HSA") +uniprotIDs = final_df["Uniprot"].apply(lambda x: x.split(":")[1]).tolist() -#Filters the combined dataframe to include only rows where the uniprot id is in swissprot +# Filters the combined dataframe to include only rows where the uniprot id is in swissprot u = UniProt() -tst = u.mapping(fr='UniProtKB', to='UniProtKB-Swiss-Prot',query = ','.join(uniprotIDs)) -failed_uniprot = pd.Series(list(set(tst['failedIds']))).apply(lambda x: 'up:'+x) +tst = u.mapping(fr="UniProtKB", to="UniProtKB-Swiss-Prot", query=",".join(uniprotIDs)) +failed_uniprot = pd.Series(list(set(tst["failedIds"]))).apply(lambda x: "up:" + x) -final_df = final_df[~final_df['Uniprot'].isin(failed_uniprot)] +final_df = final_df[~final_df["Uniprot"].isin(failed_uniprot)] diff --git a/datasets/hiv/Scripts/Name_Mapping.py b/datasets/hiv/Scripts/Name_Mapping.py index 9cace88..6fb6e6a 100644 --- a/datasets/hiv/Scripts/Name_Mapping.py +++ b/datasets/hiv/Scripts/Name_Mapping.py @@ -17,36 +17,28 @@ def main(): - - with open('Pickles/NodeIDs.pkl', 'rb') as file: + with open("Pickles/NodeIDs.pkl", "rb") as file: NodeIDs = pickle.load(file)["NodeIDs"] - job_id = submit_id_mapping( - from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids= NodeIDs - ) + job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=NodeIDs) if check_id_mapping_results_ready(job_id): link = get_id_mapping_results_link(job_id) uniprot_results = get_id_mapping_results_search(link) uniprot_IDs = [] uniprot_map = {} - for i in uniprot_results.get('results'): + for i in uniprot_results.get("results"): uniprot_IDs.append((i.get("to").get("uniProtkbId"))) - uniprot_map.update({i.get("from"):i.get("to").get("uniProtkbId")}) + uniprot_map.update({i.get("from"): i.get("to").get("uniProtkbId")}) - df ={ - "UniprotIDs": uniprot_IDs, - "UniprotMap": uniprot_map - } + df = {"UniprotIDs": uniprot_IDs, "UniprotMap": uniprot_map} - with open("Pickles/UniprotIDs.pkl","wb") as file: - pickle.dump(df,file) + with open("Pickles/UniprotIDs.pkl", "wb") as file: + pickle.dump(df, file) return - - def check_response(response): try: response.raise_for_status() @@ -169,9 +161,7 @@ def get_id_mapping_results_search(url): else: size = 500 query["size"] = size - compressed = ( - query["compressed"][0].lower() == 
"true" if "compressed" in query else False - ) + compressed = query["compressed"][0].lower() == "true" if "compressed" in query else False parsed = parsed._replace(query=urlencode(query, doseq=True)) url = parsed.geturl() request = session.get(url) @@ -195,11 +185,9 @@ def get_id_mapping_results_stream(url): parsed = urlparse(url) query = parse_qs(parsed.query) file_format = query["format"][0] if "format" in query else "json" - compressed = ( - query["compressed"][0].lower() == "true" if "compressed" in query else False - ) + compressed = query["compressed"][0].lower() == "true" if "compressed" in query else False return decode_results(request, file_format, compressed) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/datasets/hiv/Scripts/SPRAS_Formatting.py b/datasets/hiv/Scripts/SPRAS_Formatting.py index 837f52b..eb8fd99 100644 --- a/datasets/hiv/Scripts/SPRAS_Formatting.py +++ b/datasets/hiv/Scripts/SPRAS_Formatting.py @@ -3,26 +3,26 @@ import os current_directory = Path(os.path.dirname(os.path.realpath(__file__))) -PROCESSED_DIR = current_directory.parent / 'processed' +PROCESSED_DIR = current_directory.parent / "processed" -with open('Pickles/UniprotIDs.pkl', 'rb') as file: - UniprotIDs = pickle.load(file) +with open("Pickles/UniprotIDs.pkl", "rb") as file: + UniprotIDs = pickle.load(file) UIDs = UniprotIDs["UniprotIDs"] -UMap= UniprotIDs["UniprotMap"] +UMap = UniprotIDs["UniprotMap"] -with open('Pickles/NodeIDs.pkl','rb') as file2: - prizes = pickle.load(file2) +with open("Pickles/NodeIDs.pkl", "rb") as file2: + prizes = pickle.load(file2) prize_05 = prizes["prize_05"] prize_060 = prizes["prize_060"] -prize_05['Uniprot'] = prize_05['Uniprot'].apply(lambda x: UMap.get(x)) -prize_060['Uniprot'] = prize_060['Uniprot'].apply(lambda x: UMap.get(x)) +prize_05["Uniprot"] = prize_05["Uniprot"].apply(lambda x: UMap.get(x)) +prize_060["Uniprot"] = prize_060["Uniprot"].apply(lambda x: UMap.get(x)) -prize_05.columns = ['NODEID','prize'] -prize_060.columns = ['NODEID','prize'] +prize_05.columns = ["NODEID", "prize"] +prize_060.columns = ["NODEID", "prize"] -prize_05.to_csv(PROCESSED_DIR / 'processed_prize_05.txt', sep='\t', header=True, index=False) -prize_060.to_csv(PROCESSED_DIR / 'processed_prize_060.txt', sep='\t', header=True, index=False) +prize_05.to_csv(PROCESSED_DIR / "processed_prize_05.txt", sep="\t", header=True, index=False) +prize_060.to_csv(PROCESSED_DIR / "processed_prize_060.txt", sep="\t", header=True, index=False) diff --git a/datasets/rn-muscle-skeletal/process.py b/datasets/rn-muscle-skeletal/process.py index ac56cc2..0119b84 100644 --- a/datasets/rn-muscle-skeletal/process.py +++ b/datasets/rn-muscle-skeletal/process.py @@ -3,18 +3,21 @@ import os current_directory = Path(os.path.dirname(os.path.realpath(__file__))) -PROCESSED_DIR = current_directory / 'processed' +PROCESSED_DIR = current_directory / "processed" + def process(): # TODO: what are the actual last two headers called? 
- data = pandas.read_csv(current_directory / 'raw' / 'Muscle_Skeletal-Dec2018.tsv', - delimiter='\t', header=None, - names=["Interactome1", "Interactome2", "Type1", - "Type2", "InteractionType", "Weight", - "Const1", "Const2"]) + data = pandas.read_csv( + current_directory / "raw" / "Muscle_Skeletal-Dec2018.tsv", + delimiter="\t", + header=None, + names=["Interactome1", "Interactome2", "Type1", "Type2", "InteractionType", "Weight", "Const1", "Const2"], + ) data = data.drop(columns=["Type1", "Type2", "InteractionType", "Const1", "Const2"]) data.insert(3, "Direction", "U") - data.to_csv(PROCESSED_DIR / 'interactome.tsv', sep='\t', header=False, index=False) + data.to_csv(PROCESSED_DIR / "interactome.tsv", sep="\t", header=False, index=False) + -if __name__ == '__main__': +if __name__ == "__main__": process() diff --git a/datasets/yeast-osmotic-stress/process_prizes.py b/datasets/yeast-osmotic-stress/process_prizes.py index 81682a6..708c1df 100644 --- a/datasets/yeast-osmotic-stress/process_prizes.py +++ b/datasets/yeast-osmotic-stress/process_prizes.py @@ -5,18 +5,15 @@ current_directory = Path(os.path.dirname(os.path.realpath(__file__))) -if __name__ == '__main__': +if __name__ == "__main__": # Get the raw prizes DF - prizes = current_directory / 'raw' / 'prizes.txt' - prizes_df = pd.read_csv(prizes, sep='\t', header=None, names=["NODEID", "prize"]) + prizes = current_directory / "raw" / "prizes.txt" + prizes_df = pd.read_csv(prizes, sep="\t", header=None, names=["NODEID", "prize"]) # Use the manually curated prize info # TODO: where did this come from? - prizes_df2 = pd.DataFrame(data={"NODEID": ['YGR014W','YDR420W','YER118C'], - "prize": 10.051863}, index=[1596,1597,1598]) + prizes_df2 = pd.DataFrame(data={"NODEID": ["YGR014W", "YDR420W", "YER118C"], "prize": 10.051863}, index=[1596, 1597, 1598]) - new_prizes_path = current_directory / 'processed' / 'prizes1_dummies.txt' + new_prizes_path = current_directory / "processed" / "prizes1_dummies.txt" new_prizes = pd.concat([prizes_df, prizes_df2]) - new_prizes.to_csv(new_prizes_path, sep='\t', index=False, - columns=['NODEID','prize'], - header=['NODEID','prize']) + new_prizes.to_csv(new_prizes_path, sep="\t", index=False, columns=["NODEID", "prize"], header=["NODEID", "prize"])