Merge pull request #7 from ruppinlab/feat-fix-data-dependencies

wir963 · web-flow · commit a656444317b9 · 2021-10-30T10:01:44.000-04:00
Feat fix data dependencies
diff --git a/Aulicino2018/config/PathSeq-config.yaml b/Aulicino2018/config/PathSeq-config.yaml
@@ -13,16 +13,16 @@ human_ref:
 
 PathSeq:
   bam_file: "output/star/{patient}-{sample}-{plate}/unaligned.bam"
-  microbe_fasta: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa"
-  microbe_fai: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.fai"
-  microbe_dict: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.dict"
-  microbe_bwa_image: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.img"
-  taxonomy_db: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1_release201_taxonomy.db"
-  host_img: "/data/Robinson-SB/PathSeq-data/pathseq_host.fa.img"
-  host_bfi: "/data/Robinson-SB/PathSeq-data/pathseq_host.bfi"
+  microbe_fasta: "../data/microbev1.fa"
+  microbe_fai: "../data/microbev1.fa.fai"
+  microbe_dict: "../data/microbev1.dict"
+  microbe_bwa_image: "../data/microbev1.fa.img"
+  taxonomy_db: "../data/microbev1_release201_taxonomy.db"
+  host_img: "../data/pathseq_host.fa.img"
+  host_bfi: "../data/pathseq_host.bfi"
 
 VecScreen:
-  contaminant_hits: "/data/Robinson-SB/run-VecScreen/output/microbev1-vecscreen-combined-matches.bed"
+  contaminant_hits: "../data/microbev1-vecscreen-combined-matches.bed"
 
 STAR:
   FASTQ_dir: "FASTQ/trimmed"
diff --git a/Ben-Moshe2019/config/PathSeq-config.yaml b/Ben-Moshe2019/config/PathSeq-config.yaml
@@ -5,16 +5,16 @@ units: data/units.tsv
 
 PathSeq:
   bam_file: "output/BAM/{patient}-{sample}-unaligned.bam"
-  microbe_fasta: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa"
-  microbe_fai: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.fai"
-  microbe_dict: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.dict"
-  microbe_bwa_image: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.img"
-  taxonomy_db: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1_release201_taxonomy.db"
-  host_img: "/data/Robinson-SB/PathSeq-data/pathseq_host.fa.img"
-  host_bfi: "/data/Robinson-SB/PathSeq-data/pathseq_host.bfi"
+  microbe_fasta: "../data/microbev1.fa"
+  microbe_fai: "../data/microbev1.fa.fai"
+  microbe_dict: "../data/microbev1.dict"
+  microbe_bwa_image: "../data/microbev1.fa.img"
+  taxonomy_db: "../data/microbev1_release201_taxonomy.db"
+  host_img: "../data/pathseq_host.fa.img"
+  host_bfi: "../data/pathseq_host.bfi"
 
 VecScreen:
-  contaminant_hits: "/data/Robinson-SB/run-VecScreen/output/microbev1-vecscreen-combined-matches.bed"
+  contaminant_hits: "../data/microbev1-vecscreen-combined-matches.bed"
 
 params:
   PathSeq: "--min-clipped-read-length 38 --min-base-quality 1 --max-masked-bases 10 --dust-t 24"
diff --git a/Lee2020/config/PathSeq-config.yaml b/Lee2020/config/PathSeq-config.yaml
@@ -8,16 +8,16 @@ trimming:
 
 PathSeq:
   bam_file: "output/BAM/{patient}-{sample}-unaligned.bam"
-  microbe_fasta: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa"
-  microbe_fai: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.fai"
-  microbe_dict: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.dict"
-  microbe_bwa_image: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.img"
-  taxonomy_db: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1_release201_taxonomy.db"
-  host_img: "/data/Robinson-SB/PathSeq-data/pathseq_host.fa.img"
-  host_bfi: "/data/Robinson-SB/PathSeq-data/pathseq_host.bfi"
+  microbe_fasta: "../data/microbev1.fa"
+  microbe_fai: "../data/microbev1.fa.fai"
+  microbe_dict: "../data/microbev1.dict"
+  microbe_bwa_image: "../data/microbev1.fa.img"
+  taxonomy_db: "../data/microbev1_release201_taxonomy.db"
+  host_img: "../data/pathseq_host.fa.img"
+  host_bfi: "../data/pathseq_host.bfi"
 
 VecScreen:
-  contaminant_hits: "/data/Robinson-SB/run-VecScreen/output/microbev1-vecscreen-combined-matches.bed"
+  contaminant_hits: "../data/microbev1-vecscreen-combined-matches.bed"
 
 params:
   PathSeq: "--min-clipped-read-length 50 --min-base-quality 1 --max-masked-bases 10 --dust-t 24 "
diff --git a/Maynard2020/config/PathSeq-config.yaml b/Maynard2020/config/PathSeq-config.yaml
@@ -26,13 +26,13 @@ trimming:
 
 PathSeq:
   bam_file: "output/star/{patient}-{sample}-{plate}/unaligned.bam"
-  microbe_fasta: "../Aulicino2018/data/microbev1.fa"
-  microbe_fai: "../Aulicino2018/data/microbev1.fa.fai"
-  microbe_dict: "../Aulicino2018/data/microbev1.dict"
-  microbe_bwa_image: "../Aulicino2018/data/microbev1.fa.img"
-  taxonomy_db: "../Aulicino2018/data/microbev1_release201_taxonomy.db"
-  host_img: "/data/Robinson-SB/PathSeq-data/pathseq_host_ERCC92.img"
-  host_bfi: "/data/Robinson-SB/PathSeq-data/pathseq_host.bfi"
+  microbe_fasta: "../data/microbev1.fa"
+  microbe_fai: "../data/microbev1.fa.fai"
+  microbe_dict: "../data/microbev1.dict"
+  microbe_bwa_image: "../data/microbev1.fa.img"
+  taxonomy_db: "../data/microbev1_release201_taxonomy.db"
+  host_img: "../data/pathseq_host.fa.img"
+  host_bfi: "../data/pathseq_host.bfi"
 
 VecScreen:
-  contaminant_hits: "/data/Robinson-SB/run-VecScreen/output/microbev1-vecscreen-combined-matches.bed"
+  contaminant_hits: "../data/microbev1-vecscreen-combined-matches.bed"
diff --git a/Paulson2018/config/PathSeq-config.yaml b/Paulson2018/config/PathSeq-config.yaml
@@ -9,16 +9,16 @@ params:
 
 PathSeq:
   bam_file: "output/BAM/{patient}-{sample}-unaligned.bam"
-  microbe_fasta: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa"
-  microbe_fai: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.fai"
-  microbe_dict: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.dict"
-  microbe_bwa_image: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.img"
-  taxonomy_db: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1_release201_taxonomy.db"
-  host_img: "/data/Robinson-SB/PathSeq-data/pathseq_host.fa.img"
-  host_bfi: "/data/Robinson-SB/PathSeq-data/pathseq_host.bfi"
+  microbe_fasta: "../data/microbev1.fa"
+  microbe_fai: "../data/microbev1.fa.fai"
+  microbe_dict: "../data/microbev1.dict"
+  microbe_bwa_image: "../data/microbev1.fa.img"
+  taxonomy_db: "../data/microbev1_release201_taxonomy.db"
+  host_img: "../data/pathseq_host.fa.img"
+  host_bfi: "../data/pathseq_host.bfi"
 
 VecScreen:
-  contaminant_hits: "/data/Robinson-SB/run-VecScreen/output/microbev1-vecscreen-combined-matches.bed"
+  contaminant_hits: "../data/microbev1-vecscreen-combined-matches.bed"
 
 CellRanger:
   genome_dir: "$CELLRANGER_REF300/GRCh38"
diff --git a/README.md b/README.md
@@ -42,6 +42,29 @@ Currently, we support three distinct approaches for quantifying the number of re
 
 The workflows assume that the files needed by PathSeq and CAMMiQ are pre-built and their location is specified in `config/PathSeq-config.yaml`. The index used by PathSeq in this project is ~41 GB while the indices used by CAMMiQ are > 200 GB so we do not distribute them with the rest of the package although they are available upon request.
 
+The existing projects assume that these database files are located in a `data` directory at the top level of the repository, so the first step is to create the directory `CSI-Microbes-identification/data` (if it doesn't already exist) using
+
+```
+mkdir data
+```
+
+The PathSeq files can be divided into two groups: the microbial reference files and the host (human) reference files. To download the host reference files (which can take a while), use the commands below:
+
+```
+cd data
+wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/pathseq/pathseq_host.tar.gz
+tar -xf pathseq_host.tar.gz
+cd ..
+```
+
+ The microbial reference fasta file (`microbev1.fa`), VecScreen hits (`microbev1-vecscreen-combined-matches.bed`) and taxonomy hierarchy file (`microbev1_release201_taxonomy.db`) used in our paper are available from [Zenodo](https://zenodo.org/record/5604433). The microbial reference fasta file (`microbev1.fa`) can be used to build the additional microbial files required by PathSeq (`microbev1.fa.fai`, `microbev1.dict` and `microbev1.fa.img`) using the command below.
+
+ ```
+ ./scripts/run-build-PathSeq-microbe-files.sh
+ ```
+
+ Please note that building the `microbev1.fa.img` file will take ~1 day.
+
 ### Construction of PathSeq files
 
 An example of how to build the PathSeq index (and other required files) is available in `build-PathSeq-microbe-files/Snakefile`.
@@ -360,6 +383,14 @@ to
 }
 ```
 
+### What if I don't have access to Biowulf?
+
+Biowulf is the NIH's Linux cluster that uses the Slurm workload manager. It should be relatively straightforward to run CSI-Microbes-identification on another Linux cluster that uses the Slurm workload manager. Users should ensure that there is a module system that includes snakemake (6.0.5)<sup>[REF](#Snakemake)</sup>, sratoolkit (2.10.9)<sup>[REF](#SRAToolkit)</sup>, cellranger (5.0.1)<sup>[REF](#CellRanger)</sup>, samtools (1.11)<sup>[REF](#SAMtools)</sup>, bedtools (2.29.2)<sup>[REF](#BedTools)</sup>, and picard (latest=2.25.0)<sup>[REF](#Picard)</sup>.
+
+Users may also need to change the names of the partitions to the partitions used by their server (see above question for an example). To speed up and effectively parallelize the PathSeq step, it is important to use nodes with at least 200 GB of local storage because we copy the PathSeq files to the local node before running PathSeq. In our experience, running PathSeq on multiple nodes using the same reference files will be much slower because of network latency as well as competition between the nodes for access to the single set of PathSeq reference files (which will also sometimes cause errors).
+
+Currently, the example 10x analyses use an HG38 genome that exists on Biowulf. Therefore, the `CellRanger`: `genome_dir` value in `config/PathSeq-config.yaml` will need to be updated as well.
+
 ### What are the expected output files?
 
 The expected output files from CSI-Microbes-identification are pathseq.txt files, which are output in `output/PathSeq`. For example, the pathseq file for cell barcode TTTCCTCTCCACTGGG-1 from sample GSM3454529 (exposed to _Salmonella_) is located at `output/PathSeq/Pt0-GSM3454529-TTTCCTCTCCACTGGG-1/pathseq.txt`. These output files are used as input to [CSI-Microbes-analysis](https://github.com/ruppinlab/CSI-Microbes-analysis), which computes the differential abundance of microbes across cell-types.
diff --git a/build-PathSeq-microbe-files/Snakefile b/build-PathSeq-microbe-files/Snakefile
@@ -3,46 +3,44 @@ import pandas as pd
 
 # URLs
 # if not using current release, should be "ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/archive/RefSeq-{release}.catalog.gz"
-REFSEQ_CATALOG_URL = "ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/archive/RefSeq-{release}.catalog.gz"
+# REFSEQ_CATALOG_URL = "ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/archive/RefSeq-{release}.catalog.gz"
 # PathSeq expects taxdump in tar.gz file format - the archived taxdump files are in zip
 # NCBI_TAX_DUMP_URL = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"
 
 # Directories
-RAW_DIR = "raw"
-DATA_DIR = "data"
+#RAW_DIR = "raw"
+DATA_DIR = "../data"
 
 # other downloaded files
-REFSEQ_CATALOG = join(RAW_DIR, "RefSeq-{release}.catalog.gz")
-NCBI_TAX_DUMP = join(RAW_DIR, "ncbi_taxdump.tar.gz")
+#REFSEQ_CATALOG = join(RAW_DIR, "RefSeq-{release}.catalog.gz")
+#NCBI_TAX_DUMP = join(RAW_DIR, "ncbi_taxdump.tar.gz")
 
 # output files
-TAXONOMY = join(DATA_DIR, "{microbe}_{release}_taxonomy.db")
-FASTA_FILE = join(DATA_DIR, "{microbe}.fa")
-FASTA_IDX_FILE = join(DATA_DIR, "{microbe}.fa.fai")
-FASTA_DICT_FILE = join(DATA_DIR, "{microbe}.dict")
-BWA_IMAGE_INDEX = join(DATA_DIR, "{microbe}.fa.img")
+TAXONOMY = join(DATA_DIR, "microbev1_release201_taxonomy.db")
+FASTA_FILE = join(DATA_DIR, "microbev1.fa")
+FASTA_IDX_FILE = join(DATA_DIR, "microbev1.fa.fai")
+FASTA_DICT_FILE = join(DATA_DIR, "microbev1.dict")
+BWA_IMAGE_INDEX = join(DATA_DIR, "microbev1.fa.img")
 
 
 rule all:
     input:
-        TAXONOMY.format(microbe="microbev1", release="release201"),
-        FASTA_FILE.format(microbe="microbev1"),
         FASTA_IDX_FILE.format(microbe="microbev1"),
         FASTA_DICT_FILE.format(microbe="microbev1"),
         BWA_IMAGE_INDEX.format(microbe="microbev1"),
 
 
-rule download_combine_subspe_genomes:
-    input:
-        join("data", "microbe_subset_assembly_summary.txt")
-    output:
-        join("data", "microbev1.fa")
-    run:
-        df = pd.read_csv(input[0], sep="\t")
-        shell("touch {output}")
-        for index, row in df.iterrows():
-            url = row["url"]
-            shell("wget -O - {url} | gunzip -c >> {output}")
+# rule download_combine_subspe_genomes:
+#     input:
+#         join("data", "microbe_subset_assembly_summary.txt")
+#     output:
+#         join("data", "microbev1.fa")
+#     run:
+#         df = pd.read_csv(input[0], sep="\t")
+#         shell("touch {output}")
+#         for index, row in df.iterrows():
+#             url = row["url"]
+#             shell("wget -O - {url} | gunzip -c >> {output}")
 
 
 # rules for generating PathSeq data
@@ -54,30 +52,30 @@ rule download_combine_subspe_genomes:
 #     shell:
 #         "wget {params} -O {output}"
 
-rule download_RefSeq_accession_catalog:
-    params:
-        REFSEQ_CATALOG_URL
-    output:
-        REFSEQ_CATALOG
-    shell:
-        "wget {params} -O {output}"
+# rule download_RefSeq_accession_catalog:
+#     params:
+#         REFSEQ_CATALOG_URL
+#     output:
+#         REFSEQ_CATALOG
+#     shell:
+#         "wget {params} -O {output}"
 
-rule build_taxonomy_file:
-    input:
-        fa = FASTA_FILE,
-        catalog = REFSEQ_CATALOG,
-        taxdump = NCBI_TAX_DUMP,
-        fai = FASTA_IDX_FILE,
-        dict = FASTA_DICT_FILE
-    output:
-        TAXONOMY
-    shell:
-        "module load GATK/4.1.8.1 && "
-        "gatk PathSeqBuildReferenceTaxonomy "
-        "-R '{input.fa}' "
-        "--refseq-catalog '{input.catalog}' "
-        "--tax-dump '{input.taxdump}' "
-        "-O '{output}'"
+# rule build_taxonomy_file:
+#     input:
+#         fa = FASTA_FILE,
+#         catalog = REFSEQ_CATALOG,
+#         taxdump = NCBI_TAX_DUMP,
+#         fai = FASTA_IDX_FILE,
+#         dict = FASTA_DICT_FILE
+#     output:
+#         TAXONOMY
+#     shell:
+#         "module load GATK/4.1.8.1 && "
+#         "gatk PathSeqBuildReferenceTaxonomy "
+#         "-R '{input.fa}' "
+#         "--refseq-catalog '{input.catalog}' "
+#         "--tax-dump '{input.taxdump}' "
+#         "-O '{output}'"
 
 rule create_fasta_dict:
     input:
diff --git a/build-PathSeq-microbe-files/config/cluster.json b/build-PathSeq-microbe-files/config/cluster.json
@@ -10,9 +10,8 @@
   "build_BWA_image":
   {
     "nthreads": 2,
-    "mem": "400g",
+    "mem": "100g",
     "gres": 200,
-    "time": "4-00:00:00",
-    "partition": "largemem"
+    "time": "2-00:00:00"
   }
 }
diff --git a/test-10x/config/PathSeq-config.yaml b/test-10x/config/PathSeq-config.yaml
@@ -5,16 +5,16 @@ units: data/units.tsv
 
 PathSeq:
   bam_file: "output/BAM/{patient}-{sample}-unaligned.bam"
-  microbe_fasta: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa"
-  microbe_fai: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.fai"
-  microbe_dict: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.dict"
-  microbe_bwa_image: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.img"
-  taxonomy_db: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1_release201_taxonomy.db"
-  host_img: "/data/Robinson-SB/PathSeq-data/pathseq_host.fa.img"
-  host_bfi: "/data/Robinson-SB/PathSeq-data/pathseq_host.bfi"
+  microbe_fasta: "../data/microbev1.fa"
+  microbe_fai: "../data/microbev1.fa.fai"
+  microbe_dict: "../data/microbev1.dict"
+  microbe_bwa_image: "../data/microbev1.fa.img"
+  taxonomy_db: "../data/microbev1_release201_taxonomy.db"
+  host_img: "../data/pathseq_host.fa.img"
+  host_bfi: "../data/pathseq_host.bfi"
 
 VecScreen:
-  contaminant_hits: "/data/Robinson-SB/run-VecScreen/output/microbev1-vecscreen-combined-matches.bed"
+  contaminant_hits: "../data/microbev1-vecscreen-combined-matches.bed"
 
 params:
   PathSeq: "--min-clipped-read-length 38 --min-base-quality 1 --max-masked-bases 10 --dust-t 24"
diff --git a/test-SS2/config/PathSeq-config.yaml b/test-SS2/config/PathSeq-config.yaml
@@ -13,16 +13,16 @@ human_ref:
 
 PathSeq:
   bam_file: "output/star/{patient}-{sample}-{plate}/unaligned.bam"
-  microbe_fasta: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa"
-  microbe_fai: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.fai"
-  microbe_dict: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.dict"
-  microbe_bwa_image: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.img"
-  taxonomy_db: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1_release201_taxonomy.db"
-  host_img: "/data/Robinson-SB/PathSeq-data/pathseq_host.fa.img"
-  host_bfi: "/data/Robinson-SB/PathSeq-data/pathseq_host.bfi"
+  microbe_fasta: "../data/microbev1.fa"
+  microbe_fai: "../data/microbev1.fa.fai"
+  microbe_dict: "../data/microbev1.dict"
+  microbe_bwa_image: "../data/microbev1.fa.img"
+  taxonomy_db: "../data/microbev1_release201_taxonomy.db"
+  host_img: "../data/pathseq_host.fa.img"
+  host_bfi: "../data/pathseq_host.bfi"
 
 VecScreen:
-  contaminant_hits: "/data/Robinson-SB/run-VecScreen/output/microbev1-vecscreen-combined-matches.bed"
+  contaminant_hits: "../data/microbev1-vecscreen-combined-matches.bed"
 
 STAR:
   FASTQ_dir: "FASTQ/trimmed"

Original file line number	Diff line number	Diff line change
`@@ -10,9 +10,8 @@`
`10`	`10`	`"build_BWA_image":`
`11`	`11`	`{`
`12`	`12`	`"nthreads": 2,`
`13`		`- "mem": "400g",`
	`13`	`+ "mem": "100g",`
`14`	`14`	`"gres": 200,`
`15`		`- "time": "4-00:00:00",`
`16`		`- "partition": "largemem"`
	`15`	`+ "time": "2-00:00:00"`
`17`	`16`	`}`
`18`	`17`	`}`