Skip to content

Commit d954f54

Browse files
authored
Merge pull request #10 from ruppinlab/feat-Robinson2023
Feat Robinson2023
2 parents a656444 + da289d2 commit d954f54

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

68 files changed

+640602
-48250
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
data/*
12
**/.snakemake
23
**/.DS_Store
34
**/output

Ben-Moshe2019/run-SRPRISM.smk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ rule add_CR_tags_SRPRISM:
4141
SRPRISM_CB_UMI_TABLE,
4242
SRPRISM_CB_UMI_COUNT
4343
script:
44-
"src/add_CR_tags_to_SRPRISM_bam.py"
44+
"../src/add_CR_tags_to_SRPRISM_bam.py"
4545

4646
# get a read count per gene per sample file
4747
rule intersect_BAM_GFF:

Lee2020/Snakefile

Lines changed: 0 additions & 61 deletions
This file was deleted.

Lee2020/data/patients.tsv

Lines changed: 0 additions & 10 deletions
This file was deleted.

Lee2020/data/samples.tsv

Lines changed: 0 additions & 173 deletions
This file was deleted.

Lee2020/data/units.tsv

Lines changed: 0 additions & 27415 deletions
This file was deleted.

Ma2021/Snakefile

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
from os.path import join
2+
import pandas as pd
3+
from snakemake.utils import min_version
4+
##### set minimum snakemake version #####
5+
min_version("5.1.2")
6+
7+
##### load config and sample sheets #####
8+
configfile: "config/PathSeq-config.yaml"
9+
10+
11+
12+
cells = pd.read_csv(config["units"], sep="\t").set_index(["sample", "barcode", "patient"], drop=False)
13+
14+
samples = pd.read_csv(config["samples"], sep="\t").set_index(["patient", "sample"], drop=False)
15+
# samples = samples.loc[samples["Maximum read length"] == "57bp"]
16+
cells = cells.loc[cells["sample"].isin(samples["sample"])]
17+
# samples = cells[["patient", "sample", "library_id", "sample_prefix"]].reset_index(drop=True).drop_duplicates().set_index(["patient", "sample"], drop=False)
18+
# samples = samples.iloc[0:2]
19+
20+
# cells = cells.loc[cells.patient.isin(["H08", "H37"])]
21+
wildcard_constraints:
22+
patient="|".join(samples["patient"]),
23+
sample="|".join(samples["sample"])
24+
25+
# Snakemake includes
26+
# include: "../RNA-snakemake-rules/rules/cellranger.smk"
27+
include: "../pathogen-discovery-rules/rules/PathSeq-10x.smk"
28+
29+
# Directories
30+
CR_SAMPLE_ODIR = "{patient}-{sample}"
31+
FASTQ_DIR = "/data/Robinson-SB/scRNAseq_46samples_fastq/{library}"
32+
33+
# CellRanger Files
34+
CR_BAM_FILE = join(CR_SAMPLE_ODIR, "outs", "possorted_genome_bam.bam")
35+
36+
PATIENT_FASTQ_DIR = join("FASTQ", "raw", "{patient}")
37+
38+
# cellranger complains when you pass a directory as --id
39+
CR_SAMPLE_ODIR = "{patient}-{sample}"
40+
41+
42+
43+
44+
# PathSeq files
45+
PATHSEQ_BAM = join("output", "PathSeq", "{patient}-{sample}", "pathseq.bam")
46+
PATHSEQ_CELL_SCORE = join("output", "PathSeq", "{patient}-{sample}-{cell}", "pathseq.txt")
47+
48+
rule all:
49+
input:
50+
# expand(PATHSEQ_BAM, zip, patient=samples["patient"], sample=samples["sample"]),
51+
expand(PATHSEQ_CELL_SCORE, zip, patient=cells["patient"], sample=cells["sample"], cell=cells["barcode"])
52+
#expand(SRPRISM_TAG_BAM, zip, patient=samples["patient"], sample=samples["sample"], genome=samples["genome"])
53+
54+
def get_cellranger_input_directory(wildcards):
55+
library_id = samples.loc[(wildcards.patient, wildcards.sample), "library_id"]
56+
return {
57+
"dir": expand(FASTQ_DIR, library=library_id)
58+
}
59+
60+
def get_library_id(wildcards):
61+
return samples.loc[(wildcards.patient, wildcards.sample), "sample_prefix"]
62+
63+
# expected input format for FASTQ file
64+
rule cellranger_count:
65+
input:
66+
unpack(get_cellranger_input_directory)
67+
params:
68+
PATIENT_FASTQ_DIR,
69+
CR_SAMPLE_ODIR,
70+
config["CellRanger"]["genome_dir"],
71+
config["CellRanger"]["chemistry"],
72+
get_library_id
73+
output:
74+
CR_BAM_FILE
75+
shell:
76+
"module load cellranger/5.0.1 && "
77+
# snakemake auto-creates directories for output files, but cellranger expects an existing --id directory to be a pipestance directory, so remove it first
78+
"rm -rf {params[1]} && "
79+
"cellranger count --id={params[1]} "
80+
"--fastqs={input[0]}/ " # this is the path to the directory containing the FASTQ files
81+
"--sample={params[4]} " # this is the sample to use
82+
"--transcriptome={params[2]} "
83+
"--localcores=$SLURM_CPUS_PER_TASK "
84+
"--chemistry={params[3]} "
85+
"--localmem=60"
Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,6 @@ patients: data/patients.tsv
33
samples: data/samples.tsv
44
units: data/units.tsv
55

6-
trimming:
7-
skip: True
8-
96
PathSeq:
107
bam_file: "output/BAM/{patient}-{sample}-unaligned.bam"
118
microbe_fasta: "../data/microbev1.fa"
@@ -20,7 +17,7 @@ VecScreen:
2017
contaminant_hits: "../data/microbev1-vecscreen-combined-matches.bed"
2118

2219
params:
23-
PathSeq: "--min-clipped-read-length 50 --min-base-quality 1 --max-masked-bases 10 --dust-t 24 "
20+
PathSeq: "--min-clipped-read-length 38 --min-base-quality 1 --max-masked-bases 10 --dust-t 24"
2421
PathSeqScore: ""
2522

2623
CellRanger:
Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@
3535
"mem": "150g",
3636
"time": "4:00:00"
3737
},
38+
"add_CR_tags_SRPRISM":
39+
{
40+
"mem": "150g",
41+
"time": "4:00:00"
42+
},
3843
"split_PathSeq_BAM_by_CB_UB":
3944
{
4045
"mem": "8g",
@@ -45,6 +50,11 @@
4550
{
4651
"mem": "32g",
4752
"time": "4:00:00",
48-
"nthreads": 16
53+
"nthreads": 1
54+
},
55+
"run_CAMMiQ_species_long_reads":
56+
{
57+
"mem": "650g",
58+
"partition": "largemem"
4959
}
5060
}

Ma2021/data/samples.tsv

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
patient sample S_ID library_id sample_prefix Maximum read length 10x_chemistry cancer_type
2+
C25 C25 S038 9_XW_SC_1-26-2017 9_XW_SC_1-26-2017 57bp v2 CCA
3+
C26 C26a S020 Sample_CS023169_Wang_0823_1626-12-23_SCAF350 CS023169_Wang_0823_1626-12-23_SCAF350 98bp v2 CCA
4+
C26 C26b S037 8_XW_SC_1-26-2017 8_XW_SC_1-26-2017 57bp v2 CCA
5+
C29 C29 S040 11_XW_SC_3-1-2017 11_XW_SC_3-1-2017 57bp v2 CCA
6+
C35 C35 S045 20_XW_SC_7-26-2017 20_XW_SC_7-26-2017 57bp v2 CCA
7+
C39 C39 S044 19_XW_SC_7-27-2017 19_XW_SC_7-27-2017 57bp v2 CCA
8+
C42 C42 S025 Sample_CS023169_Wang_0823_1642_SCAF355 CS023169_Wang_0823_1642_SCAF355 98bp v2 CCA
9+
C46 C46a S027 Sample_CS023169_Wang_0823_1646-12-1_SCAF357 CS023169_Wang_0823_1646-12-1_SCAF357 98bp v2 CCA
10+
C46 C46b S028 Sample_CS023169_Wang_0824_1646-1-17_SCAF358 CS023169_Wang_0824_1646-1-17_SCAF358 98bp v2 CCA
11+
C52 C52 S016 Sample_CS023169_Wang_0731_1652_SCAF303 CS023169_Wang_0731_1652_SCAF303 98bp v2 CCA
12+
C56 C56 S018 Sample_CS023169_Wang_0731_1656_SCAF305 CS023169_Wang_0731_1656_SCAF305 98bp v2 CCA
13+
C60 C60 S015 Sample_CS023169_Wang_0730_1660r_SCAF300 CS023169_Wang_0730_1660r_SCAF300 98bp v2 CCA
14+
C66 C66 S034 Sample_CS023169_Wang_0824_1666_SCAF365 CS023169_Wang_0824_1666_SCAF365 98bp v2 CCA
15+
C76 C76 S010 CS025253_SCAF880_LCS1676 CS025253_SCAF880_LCS1676 98bp v2 CCA
16+
H08 H08 S001 CS024371_SCAF529_LCS1608 CS024371_SCAF529_LCS1608 98bp v2 HCC
17+
H18 H18 S043 16_XW_SC_6-13-2017 16_XW_SC_6-13-2017 57bp v2 HCC
18+
H21 H21 S035 2_XW_SC_10-25-2016 2_XW_SC_10-25-2016 57bp v2 HCC
19+
H23 H23 S039 10_XW_SC_2-2-2017 10_XW_SC_2-2-2017 57bp v2 HCC
20+
H28 H28 S036 7_XW_SC_1-12-2017 7_XW_SC_1-12-2017 57bp v2 HCC
21+
H30 H30 S041 12_XW_SC_3-2-2017 12_XW_SC_3-2-2017 57bp v2 HCC
22+
H34 H34b S021 Sample_CS023169_Wang_0823_1634-1-10_SCAF352 CS023169_Wang_0823_1634-1-10_SCAF352 98bp v2 HCC
23+
H34 H34a S022 Sample_CS023169_Wang_0823_1634-12-1_SCAF351 CS023169_Wang_0823_1634-12-1_SCAF351 98bp v2 HCC
24+
H34 H34c S023 Sample_CS023169_Wang_0823_1634-4-15_SCAF353 CS023169_Wang_0823_1634-4-15_SCAF353 98bp v2 HCC
25+
H37 H37 S046 21_XW_SC_8-1-2017 21_XW_SC_8-1-2017 57bp v2 HCC
26+
H38 H38 S042 15_XW_SC_6-8-2017 15_XW_SC_6-8-2017 57bp v2 HCC
27+
H41 H41 S024 Sample_CS023169_Wang_0823_1641_SCAF354 CS023169_Wang_0823_1641_SCAF354 98bp v2 HCC
28+
H43 H43 S026 Sample_CS023169_Wang_0823_1643_SCAF356 CS023169_Wang_0823_1643_SCAF356 98bp v2 HCC
29+
H49 H49b S008 CS025253_SCAF872_LCS1649 CS025253_SCAF872_LCS1649 98bp v2 HCC
30+
H49 H49a S029 Sample_CS023169_Wang_0824_1649_SCAF359 CS023169_Wang_0824_1649_SCAF359 98bp v2 HCC
31+
H54 H54 S014 Sample_CS023169_Wang_0730_1654_SCAF301 CS023169_Wang_0730_1654_SCAF301 98bp v2 HCC
32+
H55 H55 S017 Sample_CS023169_Wang_0731_1655_SCAF304 CS023169_Wang_0731_1655_SCAF304 98bp v2 HCC
33+
H58 H58c S012 SCAF637 SCAF637 98bp v2 HCC
34+
H58 H58a S030 Sample_CS023169_Wang_0824_1658-5-31_SCAF361 CS023169_Wang_0824_1658-5-31_SCAF361 98bp v2 HCC
35+
H58 H58b S031 Sample_CS023169_Wang_0824_1658-7-11_SCAF362 CS023169_Wang_0824_1658-7-11_SCAF362 98bp v2 HCC
36+
H62 H62 S019 Sample_CS023169_Wang_0731_1662r_SCAF307 CS023169_Wang_0731_1662r_SCAF307 98bp v2 HCC
37+
H63 H63 S032 Sample_CS023169_Wang_0824_1663_SCAF363 CS023169_Wang_0824_1663_SCAF363 98bp v2 HCC
38+
H65 H65 S033 Sample_CS023169_Wang_0824_1665_SCAF364 CS023169_Wang_0824_1665_SCAF364 98bp v2 HCC
39+
H68 H68a S011 SCAF372 SCAF372 98bp v2 HCC
40+
H68 H68b S013 SCAF589 SCAF589 98bp v2 HCC
41+
H70 H70 S002 CS023169_SCAF592 CS023169_SCAF592 98bp v2 HCC
42+
H72 H72 S003 CS023169_SCAF672 CS023169_SCAF672 98bp v3 HCC
43+
H73 H73a S004 CS023169_SCAF694 CS023169_SCAF694 98bp v3 HCC
44+
H73 H73b S006 CS025253_SCAF765_LCS1673_2 CS025253_SCAF765_LCS1673_2 98bp v2 HCC
45+
H74 H74 S005 CS025253_SCAF764_LCS1674 CS025253_SCAF764_LCS1674 98bp v2 HCC
46+
H75 H75 S007 CS025253_SCAF850_LCS1675 CS025253_SCAF850_LCS1675 98bp v2 HCC
47+
H77 H77 S009 CS025253_SCAF873_LCS1677 CS025253_SCAF873_LCS1677 98bp v2 HCC

0 commit comments

Comments
 (0)