Skip to content

Commit a656444

Browse files
authored
Merge pull request #7 from ruppinlab/feat-fix-data-dependencies
Feat fix data dependencies
2 parents 46ef407 + 2c4b6e3 commit a656444

File tree

10 files changed

+133
-105
lines changed

10 files changed

+133
-105
lines changed

Aulicino2018/config/PathSeq-config.yaml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,16 @@ human_ref:
1313

1414
PathSeq:
1515
bam_file: "output/star/{patient}-{sample}-{plate}/unaligned.bam"
16-
microbe_fasta: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa"
17-
microbe_fai: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.fai"
18-
microbe_dict: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.dict"
19-
microbe_bwa_image: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.img"
20-
taxonomy_db: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1_release201_taxonomy.db"
21-
host_img: "/data/Robinson-SB/PathSeq-data/pathseq_host.fa.img"
22-
host_bfi: "/data/Robinson-SB/PathSeq-data/pathseq_host.bfi"
16+
microbe_fasta: "../data/microbev1.fa"
17+
microbe_fai: "../data/microbev1.fa.fai"
18+
microbe_dict: "../data/microbev1.dict"
19+
microbe_bwa_image: "../data/microbev1.fa.img"
20+
taxonomy_db: "../data/microbev1_release201_taxonomy.db"
21+
host_img: "../data/pathseq_host.fa.img"
22+
host_bfi: "../data/pathseq_host.bfi"
2323

2424
VecScreen:
25-
contaminant_hits: "/data/Robinson-SB/run-VecScreen/output/microbev1-vecscreen-combined-matches.bed"
25+
contaminant_hits: "../data/microbev1-vecscreen-combined-matches.bed"
2626

2727
STAR:
2828
FASTQ_dir: "FASTQ/trimmed"

Ben-Moshe2019/config/PathSeq-config.yaml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,16 @@ units: data/units.tsv
55

66
PathSeq:
77
bam_file: "output/BAM/{patient}-{sample}-unaligned.bam"
8-
microbe_fasta: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa"
9-
microbe_fai: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.fai"
10-
microbe_dict: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.dict"
11-
microbe_bwa_image: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.img"
12-
taxonomy_db: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1_release201_taxonomy.db"
13-
host_img: "/data/Robinson-SB/PathSeq-data/pathseq_host.fa.img"
14-
host_bfi: "/data/Robinson-SB/PathSeq-data/pathseq_host.bfi"
8+
microbe_fasta: "../data/microbev1.fa"
9+
microbe_fai: "../data/microbev1.fa.fai"
10+
microbe_dict: "../data/microbev1.dict"
11+
microbe_bwa_image: "../data/microbev1.fa.img"
12+
taxonomy_db: "../data/microbev1_release201_taxonomy.db"
13+
host_img: "../data/pathseq_host.fa.img"
14+
host_bfi: "../data/pathseq_host.bfi"
1515

1616
VecScreen:
17-
contaminant_hits: "/data/Robinson-SB/run-VecScreen/output/microbev1-vecscreen-combined-matches.bed"
17+
contaminant_hits: "../data/microbev1-vecscreen-combined-matches.bed"
1818

1919
params:
2020
PathSeq: "--min-clipped-read-length 38 --min-base-quality 1 --max-masked-bases 10 --dust-t 24"

Lee2020/config/PathSeq-config.yaml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,16 @@ trimming:
88

99
PathSeq:
1010
bam_file: "output/BAM/{patient}-{sample}-unaligned.bam"
11-
microbe_fasta: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa"
12-
microbe_fai: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.fai"
13-
microbe_dict: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.dict"
14-
microbe_bwa_image: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.img"
15-
taxonomy_db: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1_release201_taxonomy.db"
16-
host_img: "/data/Robinson-SB/PathSeq-data/pathseq_host.fa.img"
17-
host_bfi: "/data/Robinson-SB/PathSeq-data/pathseq_host.bfi"
11+
microbe_fasta: "../data/microbev1.fa"
12+
microbe_fai: "../data/microbev1.fa.fai"
13+
microbe_dict: "../data/microbev1.dict"
14+
microbe_bwa_image: "../data/microbev1.fa.img"
15+
taxonomy_db: "../data/microbev1_release201_taxonomy.db"
16+
host_img: "../data/pathseq_host.fa.img"
17+
host_bfi: "../data/pathseq_host.bfi"
1818

1919
VecScreen:
20-
contaminant_hits: "/data/Robinson-SB/run-VecScreen/output/microbev1-vecscreen-combined-matches.bed"
20+
contaminant_hits: "../data/microbev1-vecscreen-combined-matches.bed"
2121

2222
params:
2323
PathSeq: "--min-clipped-read-length 50 --min-base-quality 1 --max-masked-bases 10 --dust-t 24 "

Maynard2020/config/PathSeq-config.yaml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,13 @@ trimming:
2626

2727
PathSeq:
2828
bam_file: "output/star/{patient}-{sample}-{plate}/unaligned.bam"
29-
microbe_fasta: "../Aulicino2018/data/microbev1.fa"
30-
microbe_fai: "../Aulicino2018/data/microbev1.fa.fai"
31-
microbe_dict: "../Aulicino2018/data/microbev1.dict"
32-
microbe_bwa_image: "../Aulicino2018/data/microbev1.fa.img"
33-
taxonomy_db: "../Aulicino2018/data/microbev1_release201_taxonomy.db"
34-
host_img: "/data/Robinson-SB/PathSeq-data/pathseq_host_ERCC92.img"
35-
host_bfi: "/data/Robinson-SB/PathSeq-data/pathseq_host.bfi"
29+
microbe_fasta: "../data/microbev1.fa"
30+
microbe_fai: "../data/microbev1.fa.fai"
31+
microbe_dict: "../data/microbev1.dict"
32+
microbe_bwa_image: "../data/microbev1.fa.img"
33+
taxonomy_db: "../data/microbev1_release201_taxonomy.db"
34+
host_img: "../data/pathseq_host.fa.img"
35+
host_bfi: "../data/pathseq_host.bfi"
3636

3737
VecScreen:
38-
contaminant_hits: "/data/Robinson-SB/run-VecScreen/output/microbev1-vecscreen-combined-matches.bed"
38+
contaminant_hits: "../data/microbev1-vecscreen-combined-matches.bed"

Paulson2018/config/PathSeq-config.yaml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,16 @@ params:
99

1010
PathSeq:
1111
bam_file: "output/BAM/{patient}-{sample}-unaligned.bam"
12-
microbe_fasta: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa"
13-
microbe_fai: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.fai"
14-
microbe_dict: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.dict"
15-
microbe_bwa_image: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.img"
16-
taxonomy_db: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1_release201_taxonomy.db"
17-
host_img: "/data/Robinson-SB/PathSeq-data/pathseq_host.fa.img"
18-
host_bfi: "/data/Robinson-SB/PathSeq-data/pathseq_host.bfi"
12+
microbe_fasta: "../data/microbev1.fa"
13+
microbe_fai: "../data/microbev1.fa.fai"
14+
microbe_dict: "../data/microbev1.dict"
15+
microbe_bwa_image: "../data/microbev1.fa.img"
16+
taxonomy_db: "../data/microbev1_release201_taxonomy.db"
17+
host_img: "../data/pathseq_host.fa.img"
18+
host_bfi: "../data/pathseq_host.bfi"
1919

2020
VecScreen:
21-
contaminant_hits: "/data/Robinson-SB/run-VecScreen/output/microbev1-vecscreen-combined-matches.bed"
21+
contaminant_hits: "../data/microbev1-vecscreen-combined-matches.bed"
2222

2323
CellRanger:
2424
genome_dir: "$CELLRANGER_REF300/GRCh38"

README.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,29 @@ Currently, we support three distinct approaches for quantifying the number of re
4242

4343
The workflows assume that the files needed by PathSeq and CAMMiQ are pre-built and their location is specified in `config/PathSeq-config.yaml`. The index used by PathSeq in this project is ~41 GB while the indices used by CAMMiQ are > 200 GB so we do not distribute them with the rest of the package although they are available upon request.
4444

45+
The existing projects assume that these database files will be in the `data` directory at the top level of the repository, so the first step is to create the directory `CSI-Microbes-identification/data` (if it doesn't already exist) using
46+
47+
```
48+
mkdir data
49+
```
50+
51+
The PathSeq files can be divided into two groups: the microbial reference files and the host (human) reference files. To download the host reference files (which can take a while), use the below commands
52+
53+
```
54+
cd data
55+
wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/pathseq/pathseq_host.tar.gz
56+
tar -xf pathseq_host.tar.gz
57+
cd ..
58+
```
59+
60+
The microbial reference fasta file (`microbev1.fa`), VecScreen hits (`microbev1-vecscreen-combined-matches.bed`) and taxonomy hierarchy file (`microbev1_release201_taxonomy.db`) used in our paper are available from [zenodo](https://zenodo.org/record/5604433). The microbial reference fasta file (`microbev1.fa`) can be used to build the additional microbial files required by PathSeq (`microbev1.fa.fai`, `microbev1.dict` and `microbev1.fa.img`) using the below command.
61+
62+
```
63+
./scripts/run-build-PathSeq-microbe-files.sh
64+
```
65+
66+
Please note that building the `microbev1.fa.img` file will take ~1 day.
67+
4568
### Construction of PathSeq files
4669

4770
An example of how to build the PathSeq index (and other required files) is available in `build-PathSeq-microbes-files/Snakefile`.
@@ -360,6 +383,14 @@ to
360383
}
361384
```
362385

386+
### What if I don't have access to Biowulf?
387+
388+
Biowulf is the NIH's linux cluster that uses the slurm workload manager. It should be relatively straightforward to run CSI-Microbes-identification on another linux cluster that uses the slurm workload manager. Users should ensure that there is a module system that includes snakemake (6.0.5)<sup>[REF](#Snakemake)</sup>, sratoolkit (2.10.9)<sup>[REF](#SRAToolkit)</sup>, cellranger (5.0.1)<sup>[REF](#CellRanger)</sup>, samtools (1.11)<sup>[REF](#SAMtools)</sup>, bedtools (2.29.2)<sup>[REF](#BedTools)</sup>, and picard (latest=2.25.0)<sup>[REF](#Picard)</sup>.
389+
390+
Users may also need to change the names of the partitions to the partitions used by their server (see above question for an example). To speed up and effectively parallelize the PathSeq step, it is important to use nodes with at least 200 GB of local storage because we copy the PathSeq files to the local node before running PathSeq. In our experience, running PathSeq on multiple nodes using the same reference files will be much slower because of network latency as well as competition between the nodes for access to the single set of PathSeq reference files (which will also sometimes cause errors).
391+
392+
Currently, the example 10x analyses use an HG38 genome that exists on Biowulf. Therefore, the `CellRanger`: `genome_dir` value in `config/PathSeq-config.yaml` will need to be updated as well.
393+
363394
### What are the expected output files?
364395

365396
The expected output files from CSI-Microbes-identification are pathseq.txt files, which are output in `output/PathSeq`. For example, the pathseq file for cell barcode TTTCCTCTCCACTGGG-1 from sample GSM3454529 (exposed to _Salmonella_) is located at `output/PathSeq/Pt0-GSM3454529-TTTCCTCTCCACTGGG-1/pathseq.txt`. These output files are used as input to [CSI-Microbes-analysis](https://github.com/ruppinlab/CSI-Microbes-analysis), which computes the differential abundance of microbes across cell-types.

build-PathSeq-microbe-files/Snakefile

Lines changed: 44 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -3,46 +3,44 @@ import pandas as pd
33

44
# URLs
55
# if not using current release, should be "ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/archive/RefSeq-{release}.catalog.gz"
6-
REFSEQ_CATALOG_URL = "ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/archive/RefSeq-{release}.catalog.gz"
6+
# REFSEQ_CATALOG_URL = "ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/archive/RefSeq-{release}.catalog.gz"
77
# PathSeq expects taxdump in tar.gz file format - the archived taxdump files are in zip
88
# NCBI_TAX_DUMP_URL = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"
99

1010
# Directories
11-
RAW_DIR = "raw"
12-
DATA_DIR = "data"
11+
#RAW_DIR = "raw"
12+
DATA_DIR = "../data"
1313

1414
# other downloaded files
15-
REFSEQ_CATALOG = join(RAW_DIR, "RefSeq-{release}.catalog.gz")
16-
NCBI_TAX_DUMP = join(RAW_DIR, "ncbi_taxdump.tar.gz")
15+
#REFSEQ_CATALOG = join(RAW_DIR, "RefSeq-{release}.catalog.gz")
16+
#NCBI_TAX_DUMP = join(RAW_DIR, "ncbi_taxdump.tar.gz")
1717

1818
# output files
19-
TAXONOMY = join(DATA_DIR, "{microbe}_{release}_taxonomy.db")
20-
FASTA_FILE = join(DATA_DIR, "{microbe}.fa")
21-
FASTA_IDX_FILE = join(DATA_DIR, "{microbe}.fa.fai")
22-
FASTA_DICT_FILE = join(DATA_DIR, "{microbe}.dict")
23-
BWA_IMAGE_INDEX = join(DATA_DIR, "{microbe}.fa.img")
19+
TAXONOMY = join(DATA_DIR, "microbev1_release201_taxonomy.db")
20+
FASTA_FILE = join(DATA_DIR, "microbev1.fa")
21+
FASTA_IDX_FILE = join(DATA_DIR, "microbev1.fa.fai")
22+
FASTA_DICT_FILE = join(DATA_DIR, "microbev1.dict")
23+
BWA_IMAGE_INDEX = join(DATA_DIR, "microbev1.fa.img")
2424

2525

2626
rule all:
2727
input:
28-
TAXONOMY.format(microbe="microbev1", release="release201"),
29-
FASTA_FILE.format(microbe="microbev1"),
3028
FASTA_IDX_FILE.format(microbe="microbev1"),
3129
FASTA_DICT_FILE.format(microbe="microbev1"),
3230
BWA_IMAGE_INDEX.format(microbe="microbev1"),
3331

3432

35-
rule download_combine_subspe_genomes:
36-
input:
37-
join("data", "microbe_subset_assembly_summary.txt")
38-
output:
39-
join("data", "microbev1.fa")
40-
run:
41-
df = pd.read_csv(input[0], sep="\t")
42-
shell("touch {output}")
43-
for index, row in df.iterrows():
44-
url = row["url"]
45-
shell("wget -O - {url} | gunzip -c >> {output}")
33+
# rule download_combine_subspe_genomes:
34+
# input:
35+
# join("data", "microbe_subset_assembly_summary.txt")
36+
# output:
37+
# join("data", "microbev1.fa")
38+
# run:
39+
# df = pd.read_csv(input[0], sep="\t")
40+
# shell("touch {output}")
41+
# for index, row in df.iterrows():
42+
# url = row["url"]
43+
# shell("wget -O - {url} | gunzip -c >> {output}")
4644

4745

4846
# rules for generating PathSeq data
@@ -54,30 +52,30 @@ rule download_combine_subspe_genomes:
5452
# shell:
5553
# "wget {params} -O {output}"
5654

57-
rule download_RefSeq_accession_catalog:
58-
params:
59-
REFSEQ_CATALOG_URL
60-
output:
61-
REFSEQ_CATALOG
62-
shell:
63-
"wget {params} -O {output}"
55+
# rule download_RefSeq_accession_catalog:
56+
# params:
57+
# REFSEQ_CATALOG_URL
58+
# output:
59+
# REFSEQ_CATALOG
60+
# shell:
61+
# "wget {params} -O {output}"
6462

65-
rule build_taxonomy_file:
66-
input:
67-
fa = FASTA_FILE,
68-
catalog = REFSEQ_CATALOG,
69-
taxdump = NCBI_TAX_DUMP,
70-
fai = FASTA_IDX_FILE,
71-
dict = FASTA_DICT_FILE
72-
output:
73-
TAXONOMY
74-
shell:
75-
"module load GATK/4.1.8.1 && "
76-
"gatk PathSeqBuildReferenceTaxonomy "
77-
"-R '{input.fa}' "
78-
"--refseq-catalog '{input.catalog}' "
79-
"--tax-dump '{input.taxdump}' "
80-
"-O '{output}'"
63+
# rule build_taxonomy_file:
64+
# input:
65+
# fa = FASTA_FILE,
66+
# catalog = REFSEQ_CATALOG,
67+
# taxdump = NCBI_TAX_DUMP,
68+
# fai = FASTA_IDX_FILE,
69+
# dict = FASTA_DICT_FILE
70+
# output:
71+
# TAXONOMY
72+
# shell:
73+
# "module load GATK/4.1.8.1 && "
74+
# "gatk PathSeqBuildReferenceTaxonomy "
75+
# "-R '{input.fa}' "
76+
# "--refseq-catalog '{input.catalog}' "
77+
# "--tax-dump '{input.taxdump}' "
78+
# "-O '{output}'"
8179

8280
rule create_fasta_dict:
8381
input:

build-PathSeq-microbe-files/config/cluster.json

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,8 @@
1010
"build_BWA_image":
1111
{
1212
"nthreads": 2,
13-
"mem": "400g",
13+
"mem": "100g",
1414
"gres": 200,
15-
"time": "4-00:00:00",
16-
"partition": "largemem"
15+
"time": "2-00:00:00"
1716
}
1817
}

test-10x/config/PathSeq-config.yaml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,16 @@ units: data/units.tsv
55

66
PathSeq:
77
bam_file: "output/BAM/{patient}-{sample}-unaligned.bam"
8-
microbe_fasta: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa"
9-
microbe_fai: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.fai"
10-
microbe_dict: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.dict"
11-
microbe_bwa_image: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.img"
12-
taxonomy_db: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1_release201_taxonomy.db"
13-
host_img: "/data/Robinson-SB/PathSeq-data/pathseq_host.fa.img"
14-
host_bfi: "/data/Robinson-SB/PathSeq-data/pathseq_host.bfi"
8+
microbe_fasta: "../data/microbev1.fa"
9+
microbe_fai: "../data/microbev1.fa.fai"
10+
microbe_dict: "../data/microbev1.dict"
11+
microbe_bwa_image: "../data/microbev1.fa.img"
12+
taxonomy_db: "../data/microbev1_release201_taxonomy.db"
13+
host_img: "../data/pathseq_host.fa.img"
14+
host_bfi: "../data/pathseq_host.bfi"
1515

1616
VecScreen:
17-
contaminant_hits: "/data/Robinson-SB/run-VecScreen/output/microbev1-vecscreen-combined-matches.bed"
17+
contaminant_hits: "../data/microbev1-vecscreen-combined-matches.bed"
1818

1919
params:
2020
PathSeq: "--min-clipped-read-length 38 --min-base-quality 1 --max-masked-bases 10 --dust-t 24"

test-SS2/config/PathSeq-config.yaml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,16 @@ human_ref:
1313

1414
PathSeq:
1515
bam_file: "output/star/{patient}-{sample}-{plate}/unaligned.bam"
16-
microbe_fasta: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa"
17-
microbe_fai: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.fai"
18-
microbe_dict: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.dict"
19-
microbe_bwa_image: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1.fa.img"
20-
taxonomy_db: "/data/Robinson-SB/CSI-Microbes-identification/Aulicino2018/data/microbev1_release201_taxonomy.db"
21-
host_img: "/data/Robinson-SB/PathSeq-data/pathseq_host.fa.img"
22-
host_bfi: "/data/Robinson-SB/PathSeq-data/pathseq_host.bfi"
16+
microbe_fasta: "../data/microbev1.fa"
17+
microbe_fai: "../data/microbev1.fa.fai"
18+
microbe_dict: "../data/microbev1.dict"
19+
microbe_bwa_image: "../data/microbev1.fa.img"
20+
taxonomy_db: "../data/microbev1_release201_taxonomy.db"
21+
host_img: "../data/pathseq_host.fa.img"
22+
host_bfi: "../data/pathseq_host.bfi"
2323

2424
VecScreen:
25-
contaminant_hits: "/data/Robinson-SB/run-VecScreen/output/microbev1-vecscreen-combined-matches.bed"
25+
contaminant_hits: "../data/microbev1-vecscreen-combined-matches.bed"
2626

2727
STAR:
2828
FASTQ_dir: "FASTQ/trimmed"

0 commit comments

Comments
 (0)