Reed-CompBio
diff --git a/cache/__init__.py b/cache/__init__.py
Lines changed: 9 additions & 6 deletions
diff --git a/cache/cli.py b/cache/cli.py
Lines changed: 6 additions & 5 deletions
diff --git a/cache/directory.py b/cache/directory.py
Lines changed: 42 additions & 38 deletions
diff --git a/datasets/contributing/raw_generation.py b/datasets/contributing/raw_generation.py
Lines changed: 9 additions & 3 deletions
diff --git a/datasets/diseases/scripts/interactome.py b/datasets/diseases/scripts/interactome.py
Lines changed: 2 additions & 0 deletions
@@ -14,9 +14,11 @@
 dir_path = Path(os.path.dirname(os.path.realpath(__file__)))
 artifacts_dir = dir_path / "artifacts"
 
+
 def get_artifact_name(directive: list[str]) -> str:
     return quote_plus("/".join(directive))
 
+
 def has_expired(directive: list[str]) -> bool:
     """
     Check if the artifact metadata associated with a directive has expired.
@@ -25,29 +27,30 @@ def has_expired(directive: list[str]) -> bool:
     artifact_name = get_artifact_name(directive)
     cache_item = get_cache_item(directive)
 
-    metadata_dir = artifacts_dir / 'metadata'
+    metadata_dir = artifacts_dir / "metadata"
     metadata_dir.mkdir(exist_ok=True)
-    metadata_file = (artifacts_dir / 'metadata' / artifact_name).with_suffix((artifacts_dir / artifact_name).suffix + '.metadata')
+    metadata_file = (artifacts_dir / "metadata" / artifact_name).with_suffix((artifacts_dir / artifact_name).suffix + ".metadata")
 
     # metadata never existed: we need to retrieve the new file
     if not metadata_file.exists():
-        with open(metadata_file, 'wb') as f:
+        with open(metadata_file, "wb") as f:
             pickle.dump(cache_item, f)
         return True
 
     old_cache_item = None
-    with open(metadata_file, 'rb') as f:
+    with open(metadata_file, "rb") as f:
         old_cache_item = pickle.load(f)
 
     # metadata expired: re-retrieve the item
     if old_cache_item != cache_item:
-        with open(metadata_file, 'wb') as f:
+        with open(metadata_file, "wb") as f:
             pickle.dump(cache_item, f)
         return True
 
     # metadata hasn't changed and already existed: this hasn't expired
     return False
 
+
 def link(output: str, directive: list[str], uncompress=False):
     """
     Links output files from cache.directory directives.
@@ -74,7 +77,7 @@ def link(output: str, directive: list[str], uncompress=False):
         cache_item.download(artifacts_dir / artifact_name)
 
     if uncompress:
-        uncompressed_artifact_path = Path(str(artifacts_dir / artifact_name) + '.uncompressed')
+        uncompressed_artifact_path = Path(str(artifacts_dir / artifact_name) + ".uncompressed")
         uncompressed_artifact_path.unlink(missing_ok=True)
         uncompress_file(artifacts_dir / artifact_name, uncompressed_artifact_path)
         Path(output).symlink_to(uncompressed_artifact_path)
 
@@ -10,20 +10,21 @@
 import argparse
 from cache.directory import get_cache_item
 
+
 def parse_args():
-    parser = argparse.ArgumentParser(
-        prog='Cache',
-        description='CLI utility for directory.py')
-    parser.add_argument('path')
-    parser.add_argument('output')
+    parser = argparse.ArgumentParser(prog="Cache", description="CLI utility for directory.py")
+    parser.add_argument("path")
+    parser.add_argument("output")
 
     return parser.parse_args()
 
+
 def main():
     args = parse_args()
     cache_item = get_cache_item(args.path.split("/"))
 
     cache_item.download(args.output)
 
+
 if __name__ == "__main__":
     main()
@@ -16,12 +16,14 @@
 dir_path = Path(__file__).parent.resolve()
 
 # Our cache emits warnings for files with unpinned versions that don't match the cache.
-(dir_path / 'logs').mkdir(exist_ok=True)
-logger.add(dir_path / 'logs' / "cache.log", level="WARNING")
+(dir_path / "logs").mkdir(exist_ok=True)
+logger.add(dir_path / "logs" / "cache.log", level="WARNING")
+
 
 class DownloadFileCheckException(RuntimeError):
     """See Service#download_against_cache for some motivation for this custom error"""
 
+
 @dataclass
 class Service:
     url: str
@@ -34,17 +36,12 @@ def download(self, output: str | PathLike) -> requests.Response:
         # As per https://stackoverflow.com/a/39217788/7589775 to enable download streaming.
         with requests.get(self.url, stream=True, headers=self.headers) as response:
             response.raw.decode_content = True
-            with open(output, 'wb') as f:
+            with open(output, "wb") as f:
                 shutil.copyfileobj(response.raw, f)
             return response
 
     # NOTE: this is slightly yucky code deduplication. The only intended values of `downloaded_file_type` are `pinned` and `unpinned`.
-    def download_against_cache(
-            self,
-            cache: Path,
-            downloaded_file_type: str,
-            move_output: bool
-        ):
+    def download_against_cache(self, cache: Path, downloaded_file_type: str, move_output: bool):
         """
         Downloads `this` Service and checks it against the provided `cache` at path. In logs,
         the file will be referred to as `downloaded_file_type`.
@@ -68,21 +65,24 @@ def download_against_cache(
             else:
                 shutil.copy(cache, debug_file_path)
             # We use a custom error type to prevent any overlap with RuntimeError. I am not sure if there is any.
-            raise DownloadFileCheckException(f"The {downloaded_file_type} file {downloaded_file_path} and " + \
-                                             f"cached file originally at {cache} do not match! " + \
-                                             f"Compare the pinned {downloaded_file_path} and the cached {debug_file_path}.")
+            raise DownloadFileCheckException(
+                f"The {downloaded_file_type} file {downloaded_file_path} and "
+                + f"cached file originally at {cache} do not match! "
+                + f"Compare the pinned {downloaded_file_path} and the cached {debug_file_path}."
+            )
         else:
             # Since we don't clean up pinned_file_path for the above branch's debugging,
             # we need to clean it up here.
             downloaded_file_path.unlink()
 
     @staticmethod
-    def coerce(obj: 'Service | str') -> 'Service':
+    def coerce(obj: "Service | str") -> "Service":
         # TODO: This could also be replaced by coercing str to Service in CacheItem via pydantic.
         if isinstance(obj, str):
             return Service(url=obj)
         return obj
 
+
 def fetch_biomart_service(xml: str) -> Service:
     """
     Access BioMart data through the BioMart REST API:
@@ -91,6 +91,7 @@ def fetch_biomart_service(xml: str) -> Service:
     ROOT = "http://www.ensembl.org/biomart/martservice?query="
     return Service(ROOT + urllib.parse.quote_plus(xml))
 
+
 @dataclass
 class CacheItem:
     """
@@ -136,7 +137,7 @@ def download(self, output: str | PathLike):
         logger.info(f"Fetching {self.name}...")
 
         logger.info(f"Downloading cache {self.cached} to {output}...")
-        gdown.download(self.cached, str(output)) # gdown doesn't have a type signature, but it expects a string :/
+        gdown.download(self.cached, str(output))  # gdown doesn't have a type signature, but it expects a string :/
 
         if self.pinned is not None:
             Service.coerce(self.pinned).download_against_cache(cache=Path(output), downloaded_file_type="pinned", move_output=True)
@@ -148,6 +149,8 @@ def download(self, output: str | PathLike):
                 logger.warning(err)
 
         # TODO: yikes! same with self.unpinned
+
+
 CacheDirectory = dict[str, Union[CacheItem, "CacheDirectory"]]
 
 # An *unversioned* directory list.
@@ -255,8 +258,8 @@ def download(self, output: str | PathLike):
             name="KEGG 03250",
             cached="https://drive.google.com/uc?id=16dtWKHCQMp2qrLfFDE7nVhbwBCr2H5a9",
             unpinned=Service(
-                "https://www.kegg.jp/kegg-bin/download?entry=ko03250&format=kgml",
-                headers={'Referer': 'https://www.kegg.jp/pathway/ko03250'})
+                "https://www.kegg.jp/kegg-bin/download?entry=ko03250&format=kgml", headers={"Referer": "https://www.kegg.jp/pathway/ko03250"}
+            ),
         )
     },
     "HIV1": {
@@ -266,13 +269,13 @@ def download(self, output: str | PathLike):
         "prize_05.tsv": CacheItem(
             name="HIV_05 prizes",
             cached="https://drive.google.com/uc?id=1jVWNRPfYkbqimO44GdzXYB3-7NXhet1m",
-            pinned="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/refs/heads/main/Results/base_analysis/prize_05.csv"
+            pinned="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/refs/heads/main/Results/base_analysis/prize_05.csv",
         ),
         "prize_060.tsv": CacheItem(
             name="HIV_060 prizes",
             cached="https://drive.google.com/uc?id=1Aucgp7pcooGr9oT4m2bvYEuYW6186WxQ",
-            pinned="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/refs/heads/main/Results/base_analysis/prize_060.csv"
-        )
+            pinned="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/refs/heads/main/Results/base_analysis/prize_060.csv",
+        ),
     },
     "iRefIndex": {
         # This can also be obtained from the SPRAS repo, though the SPRAS repo removes self loops. We don't.
@@ -283,42 +286,42 @@ def download(self, output: str | PathLike):
         "phosphosite-irefindex13.0-uniprot.txt": CacheItem(
             name="iRefIndex v13.0 UniProt interactome",
             cached="https://drive.google.com/uc?id=1fQ8Z3FjEwUseEtsExO723zj7mAAtdomo",
-            pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/networks/phosphosite-irefindex13.0-uniprot.txt"
+            pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/networks/phosphosite-irefindex13.0-uniprot.txt",
         )
     },
     "OsmoticStress": {
         "yeast_pcsf_network.sif": CacheItem(
             # In the paper https://doi.org/10.1016/j.celrep.2018.08.085
             name="Case Study Edge Results, from Supplementary Data 3",
-            cached="https://drive.google.com/uc?id=1Agte0Aezext-8jLhGP4GmaF3tS7gHX-h"
+            cached="https://drive.google.com/uc?id=1Agte0Aezext-8jLhGP4GmaF3tS7gHX-h",
         ),
         # The following files are from https://github.com/gitter-lab/osmotic-stress.
         # While the following files do point to the repository's main branch,
         # they aren't expected to actually change.
         "prizes.txt": CacheItem(
             name="Osmotic Stress Prizes",
             pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/prizes.txt",
-            cached="https://drive.google.com/uc?id=16WDQs0Vjv6rI12-hbifsbnpH31jMGhJg"
+            cached="https://drive.google.com/uc?id=16WDQs0Vjv6rI12-hbifsbnpH31jMGhJg",
         ),
         "ChasmanNetwork-DirUndir.txt": CacheItem(
             name="Network Input",
             pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/ChasmanNetwork-DirUndir.txt",
-            cached="https://drive.google.com/uc?id=1qYXPaWcPU72YYME7NaBzD7thYCHRzrLH"
+            cached="https://drive.google.com/uc?id=1qYXPaWcPU72YYME7NaBzD7thYCHRzrLH",
         ),
         "dummy.txt": CacheItem(
             name="Dummy Nodes File",
             pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/dummy.txt",
-            cached="https://drive.google.com/uc?id=1dsFIhBrIEahggg0JPxw64JwS51pKxoQU"
+            cached="https://drive.google.com/uc?id=1dsFIhBrIEahggg0JPxw64JwS51pKxoQU",
         ),
         "_edgeFreq.eda ": CacheItem(
             name="Case Study Omics Integrator Edge Frequencies",
             pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Notebooks/Forest-TPS/_edgeFreq.eda",
-            cached="https://drive.google.com/uc?id=1M_rxEzUCo_EVuFyM47OEH2J-4LB3eeCR"
+            cached="https://drive.google.com/uc?id=1M_rxEzUCo_EVuFyM47OEH2J-4LB3eeCR",
         ),
         "goldStandardUnionDetailed.txt": CacheItem(
             name="Gold Standard Reference Pathways",
             pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/data/evaluation/goldStandardUnionDetailed.txt",
-            cached="https://drive.google.com/uc?id=1-_zF9oKFCNmJbDCC2vq8OM17HJw80s2T"
+            cached="https://drive.google.com/uc?id=1-_zF9oKFCNmJbDCC2vq8OM17HJw80s2T",
         ),
     },
     "EGFR": {
@@ -328,19 +331,19 @@ def download(self, output: str | PathLike):
         "eight-egfr-reference-all.txt": CacheItem(
             name="EGFR Gold Standard Reference",
             pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/resources/eight-egfr-reference-all.txt",
-            cached="https://drive.google.com/uc?id=15MqpIbH1GRA1tq0ZXH9oMnKytoFSzXyw"
+            cached="https://drive.google.com/uc?id=15MqpIbH1GRA1tq0ZXH9oMnKytoFSzXyw",
         ),
         "egfr-prizes.txt": CacheItem(
             name="EGFR prizes",
             pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt",
-            cached="https://drive.google.com/uc?id=1nI5hw-rYRZPs15UJiqokHpHEAabRq6Xj"
-        )
+            cached="https://drive.google.com/uc?id=1nI5hw-rYRZPs15UJiqokHpHEAabRq6Xj",
+        ),
     },
     "Surfaceome": {
         "table_S3_surfaceome.xlsx": CacheItem(
             name="Human surfaceome",
             unpinned="http://wlab.ethz.ch/surfaceome/table_S3_surfaceome.xlsx",
-            cached="https://docs.google.com/uc?id=1cBXYbDnAJVet0lv3BRrizV5FuqfMbBr0"
+            cached="https://docs.google.com/uc?id=1cBXYbDnAJVet0lv3BRrizV5FuqfMbBr0",
         )
     },
     "TranscriptionFactors": {
@@ -357,7 +360,7 @@ def download(self, output: str | PathLike):
         "pc-biopax.owl.gz": CacheItem(
             name="PathwayCommons Universal BioPAX file",
             cached="https://drive.google.com/uc?id=1R7uE2ky7fGlZThIWCOblu7iqbpC-aRr0",
-            pinned="https://download.baderlab.org/PathwayCommons/PC2/v14/pc-biopax.owl.gz"
+            pinned="https://download.baderlab.org/PathwayCommons/PC2/v14/pc-biopax.owl.gz",
         ),
         "pathways.txt.gz": CacheItem(
             name="PathwayCommons Pathway Identifiers",
@@ -367,15 +370,14 @@ def download(self, output: str | PathLike):
         "denylist.txt": CacheItem(
             name="PathwayCommons small molecule denylist",
             cached="https://drive.google.com/uc?id=1QmISJXPvVljA8oKuNYRUNbJJvZKPa_-u",
-            pinned="https://download.baderlab.org/PathwayCommons/PC2/v14/blacklist.txt"
+            pinned="https://download.baderlab.org/PathwayCommons/PC2/v14/blacklist.txt",
         ),
         "intermediate": {
             "pc-panther-biopax.owl": CacheItem(
-                name="PathwayCommons PANTHER-only BioPAX file",
-                cached="https://drive.google.com/uc?id=1MklrD8CJ1BIjh_wWr_g5rrIJ5XJB7FUI"
+                name="PathwayCommons PANTHER-only BioPAX file", cached="https://drive.google.com/uc?id=1MklrD8CJ1BIjh_wWr_g5rrIJ5XJB7FUI"
             )
-        }
-    }
+        },
+    },
 }
 
 
@@ -394,7 +396,9 @@ def get_cache_item(path: list[str]) -> CacheItem:
 
     # Google Drive validation. TODO: remove if move to OSDF.
     if "uc?id=" not in current_item.cached or "/view?usp=sharing" in current_item.cached:
-        raise RuntimeError("Make sure your Google Drive URLs are in https://drive.google.com/uc?id=... format " + \
-                           "with no /view?usp=sharing at the end. See CONTRIBUTING.md for more info.")
+        raise RuntimeError(
+            "Make sure your Google Drive URLs are in https://drive.google.com/uc?id=... format "
+            + "with no /view?usp=sharing at the end. See CONTRIBUTING.md for more info."
+        )
 
     return current_item
@@ -6,14 +6,17 @@
 import uuid
 import pandas
 
+
 def random_id() -> str:
     return uuid.uuid4().hex
 
+
 def assign_ids(graph: networkx.DiGraph) -> networkx.DiGraph:
     """Assigns new IDs to a graph based on `random_id`"""
     mapping = {node: random_id() for node in graph}
     return networkx.relabel_nodes(graph, mapping)
 
+
 def gnp_noise(graph: networkx.DiGraph, p: float):
     """
     The mutative equivalent to networkx.gnp_random_graph,
@@ -23,8 +26,9 @@ def gnp_noise(graph: networkx.DiGraph, p: float):
         if random.random() < p:
             graph.add_edge(*e)
 
+
 def generate_parser():
-    parser = argparse.ArgumentParser(prog='Pathway generator')
+    parser = argparse.ArgumentParser(prog="Pathway generator")
     parser.add_argument("--path-count", type=int, default=10)
     parser.add_argument("--path-length", type=int, default=7)
 
@@ -39,6 +43,7 @@ def generate_parser():
     parser.add_argument("--interactome-output", type=str, default="interactome.tsv")
     return parser
 
+
 def main():
     args = generate_parser().parse_args()
 
@@ -66,13 +71,14 @@ def main():
     gold_standard = pandas.DataFrame(((a, b) for a, b, _data in networkx.to_edgelist(graph)), columns=["Source", "Target"])
     # We make the gold standard output a little annoying to force some post-processing with pandas.
     gold_standard.insert(1, "Interaction-Type", "pp")
-    gold_standard.to_csv(args.gold_standard_output, index=False, sep='\t')
+    gold_standard.to_csv(args.gold_standard_output, index=False, sep="\t")
 
     # and we'll follow along similarly to above to build our interactome.
     graph.add_nodes_from((random_id() for _ in range(args.interactome_extra_nodes)))
     gnp_noise(graph, args.interactome_noise)
     interactome = pandas.DataFrame(((a, b) for a, b, _data in networkx.to_edgelist(graph)), columns=["Source", "Target"])
-    interactome.to_csv(args.interactome_output, index=False, sep='\t')
+    interactome.to_csv(args.interactome_output, index=False, sep="\t")
+
 
 if __name__ == "__main__":
     main()
@@ -3,6 +3,7 @@
 
 diseases_path = Path(__file__).parent.parent.resolve()
 
+
 def main():
     # See /cache/directory.py for information on how this was grabbed.
     # 9606 is the organism code for homo sapiens and the required background interactome of DISEASES.
@@ -15,5 +16,6 @@ def main():
     (diseases_path / "processed").mkdir(exist_ok=True)
     string.to_csv(diseases_path / "processed" / "string_interactome.tsv", sep="\t", index=False, header=False)
 
+
 if __name__ == "__main__":
     main()