|
| 1 | +# ------------------------------------------------------------------------------------------------- |
| 2 | +# Copyright (c) 2023, DHS. |
| 3 | +# |
| 4 | +# This file is part of MicroHapDB (http://github.com/bioforensics/MicroHapDB) and is licensed under |
| 5 | +# the BSD license: see LICENSE.txt. |
| 6 | +# |
| 7 | +# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National |
| 8 | +# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the |
| 9 | +# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and |
| 10 | +# Development Center. |
| 11 | +# ------------------------------------------------------------------------------------------------- |
| 12 | + |
| 13 | +import json |
| 14 | +import os |
| 15 | +import pandas as pd |
| 16 | +from tqdm import tqdm |
| 17 | +from urllib.request import urlretrieve |
| 18 | + |
| 19 | +accessions = {37: "GCF_000001405.25", 38: "GCF_000001405.40"} |
| 20 | + |
| 21 | + |
def progress(t):
    """Build a ``reporthook`` for ``urlretrieve`` that drives the progress bar *t*.

    *t* is expected to expose a ``total`` attribute and an ``update(n)`` method
    (e.g. a ``tqdm`` instance). Adapted from the example in the tqdm docs.
    """
    last_block = 0

    def hook(b=1, bsize=1, tsize=None):
        nonlocal last_block
        if tsize is not None:
            t.total = tsize
        # urlretrieve reports cumulative block counts; convert to a delta.
        t.update((b - last_block) * bsize)
        last_block = b

    return hook
| 33 | + |
| 34 | + |
# Top-level target: the dbSNP b156 VCFs (plus tabix indexes) for both GRCh37
# and GRCh38, the merged-rsID lookup table, and the UCSC liftOver chain files
# in both directions.
rule all:
    input:
        expand(
            "dbSNP/dbSNP_GRCh{version}.{extension}",
            version=(37, 38),
            extension=("vcf.gz", "vcf.gz.tbi"),
        ),
        "dbSNP/refsnp-merged.csv.gz",
        expand("{contrast}.over.chain.gz", contrast=("hg19ToHg38", "hg38ToHg19")),
| 44 | + |
| 45 | + |
# Download the dbSNP b156 VCF (or its tabix index) for the requested assembly
# and expose it under a build-numbered name via a symlink inside dbSNP/.
rule dbsnp:
    output:
        path="dbSNP/dbSNP_GRCh{version}.{extension}",
    wildcard_constraints:
        # Raw string: "\d" is an invalid escape sequence in a plain string
        # literal (SyntaxWarning from Python 3.12, slated to become an error).
        version=r"\d+",
    run:
        accession = accessions[int(wildcards.version)]
        # NCBI names the files "<accession>.gz" / "<accession>.gz.tbi", i.e.
        # without the "vcf." infix used in our output names.
        url_ext = wildcards.extension.replace("vcf.", "")
        url = f"https://ftp.ncbi.nih.gov/snp/archive/b156/VCF/{accession}.{url_ext}"
        origpath = Path("dbSNP") / Path(url).name
        with tqdm(unit="B", unit_scale=True, leave=True, miniters=1, desc=str(origpath)) as t:
            urlretrieve(url, origpath, reporthook=progress(t))
        # Create the symlink relative to the dbSNP/ directory via dir_fd, and
        # close the descriptor afterwards (the original call leaked it).
        dir_fd = os.open("dbSNP", os.O_RDONLY)
        try:
            os.symlink(origpath.name, Path(output.path).name, dir_fd=dir_fd)
        finally:
            os.close(dir_fd)
| 59 | + |
| 60 | + |
# Download the dbSNP b156 merged-rsID report (one JSON object per line),
# decompress it, and flatten it into a two-column CSV mapping each retired
# rsID ("Source") to the rsID it was merged into ("Target").
rule merged:
    output:
        path="dbSNP/refsnp-merged.csv.gz",
    run:
        url = "https://ftp.ncbi.nih.gov/snp/archive/b156/JSON/refsnp-merged.json.bz2"
        json_path = "dbSNP/refsnp-merged.json.bz2"
        urlretrieve(url, json_path)
        # bunzip2 replaces the .bz2 file with dbSNP/refsnp-merged.json.
        shell(f"bunzip2 {json_path}")
        merged_rsids = dict()
        # Progress is printed on a roughly logarithmic schedule: every 1e6
        # rows at first, then every 1e7 rows once the threshold reaches 1e7,
        # and so on — the interval is promoted each time it has fired 10x.
        updateint = 1e6
        threshold = updateint
        with open("dbSNP/refsnp-merged.json", "r") as instream:
            for n, line in enumerate(instream):
                try:
                    data = json.loads(line)
                except Exception:
                    # Malformed lines are reported and skipped rather than
                    # aborting the long download/parse run.
                    warn(f"Could not parse line {n+1}, skipping: {line}")
                    continue
                source = data["refsnp_id"]
                targets = data["merged_snapshot_data"]["merged_into"]
                # An rsID may list several merge targets; the dict key is
                # overwritten each time, so only the last target is kept.
                for target in targets:
                    merged_rsids[f"rs{source}"] = f"rs{target}"
                if n >= threshold:
                    threshold += updateint
                    if threshold == updateint * 10:
                        updateint = threshold
                    print(f"processed {n} rows")
        table = pd.DataFrame(merged_rsids.items(), columns=["Source", "Target"])
        table.to_csv(output.path, index=False, compression="gzip")
| 90 | + |
| 91 | + |
# Download a UCSC liftOver chain file; the wildcard is e.g. "hg19ToHg38".
rule chain:
    output:
        path="{contrast}.over.chain.gz",
    run:
        # The first four characters of the contrast (e.g. "hg19") name the
        # source-assembly directory on the UCSC download server.
        source = wildcards.contrast[:4]
        url = f"https://hgdownload.cse.ucsc.edu/goldenpath/{source}/liftOver/{wildcards.contrast}.over.chain.gz"
        urlretrieve(url, output.path)
0 commit comments