Skip to content

Commit 05ad29f

Browse files
authored
Merge pull request #44 from Dartmouth-Data-Analytics-Core/dev
#43 Improvements to PCA rule
2 parents 0eac55f + eb81311 commit 05ad29f

File tree

6 files changed

+355 additions, -483 deletions

Snakefile

Lines changed: 19 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,6 @@ sample_list = list(samples_df['sample_id'])
1313

1414
print(config)
1515

16-
17-
18-
19-
2016
#####~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2117
# define rules
2218
#####~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -35,7 +31,8 @@ rule all:
3531
expand("rsem/{sample}.genes.results", sample=sample_list) if config["run_rsem"] == "yes" else [],
3632
expand("rsem/{sample}.isoforms.results", sample=sample_list) if config["run_rsem"] == "yes" else [],
3733
"featurecounts/featurecounts.readcounts.tsv",
38-
"plots/PCA_Variance_Bar_Plot.png",
34+
"plots/PCA_top_PC1_vs_PC2.png",
35+
"plots/PCA_top_PCA_variance_bar.png",
3936
"featurecounts/featurecounts.readcounts.ann.tsv",
4037
"featurecounts/featurecounts.readcounts_tpm.tsv",
4138
"featurecounts/featurecounts.readcounts_tpm.ann.tsv",
@@ -45,7 +42,7 @@ rule all:
4542
"featurecounts/featurecounts.readcounts_fpkm.ann.tsv",
4643
conda:
4744
"env_config/multiqc.yaml",
48-
resources: cpus="10", maxtime="2:00:00", mem_mb="60gb",
45+
resources: cpus="10", maxtime="2:00:00", mem_mb=60000,
4946

5047
params:
5148
layout=config["layout"],
@@ -96,7 +93,7 @@ rule trimming:
9693
nextseq_flag = config["cutadapt_nextseq_flag"]
9794
conda:
9895
"env_config/cutadapt.yaml",
99-
resources: cpus="10", maxtime="2:00:00", mem_mb="60gb",
96+
resources: cpus="10", maxtime="2:00:00", mem_mb=60000,
10097

10198
shell: """
10299
if [ "{params.layout}" == "paired" ]
@@ -141,7 +138,7 @@ if config["aligner_name"]=="star":
141138
conda:
142139
"env_config/alignment.yaml",
143140

144-
resources: cpus="10", maxtime="8:00:00", mem_mb="120gb",
141+
resources: cpus="10", maxtime="8:00:00", mem_mb=120000,
145142

146143
shell: """
147144
align_folder="sample_ref/STAR_index"
@@ -183,7 +180,7 @@ if config["aligner_name"]=="star":
183180
conda:
184181
"env_config/alignment.yaml",
185182

186-
resources: cpus="5", maxtime="8:00:00", mem_mb="100gb",
183+
resources: cpus="5", maxtime="8:00:00", mem_mb=100000,
187184

188185
shell: """
189186
align_folder=`cat alignment/index_status.txt`
@@ -238,7 +235,7 @@ if config["aligner_name"]=="hisat":
238235
conda:
239236
"env_config/alignment.yaml",
240237

241-
resources: cpus="4", maxtime="8:00:00", mem_mb="40gb",
238+
resources: cpus="4", maxtime="8:00:00", mem_mb=40000,
242239

243240
shell: """
244241
{params.hisat2} \
@@ -272,7 +269,7 @@ rule alignment_metrics:
272269
conda:
273270
"env_config/samtools.yaml",
274271

275-
resources: cpus="2", maxtime="8:00:00", mem_mb="20gb",
272+
resources: cpus="2", maxtime="8:00:00", mem_mb=20000,
276273

277274
shell: """
278275
{params.samtools} flagstat alignment/{params.sample}.srt.bam > alignment/stats/{params.sample}.srt.bam.flagstat
@@ -290,7 +287,7 @@ rule picard_markdup:
290287
conda:
291288
"env_config/picard.yaml",
292289

293-
resources: cpus="2", maxtime="30:00", mem_mb="20gb",
290+
resources: cpus="2", maxtime="30:00", mem_mb=20000,
294291

295292
shell: """
296293
{params.picard} -Xmx2G -Xms2G \
@@ -319,7 +316,7 @@ rule picard_collectmetrics:
319316
conda:
320317
"env_config/picard.yaml",
321318

322-
resources: cpus="2", maxtime="8:00:00", mem_mb="20gb",
319+
resources: cpus="2", maxtime="8:00:00", mem_mb=20000,
323320

324321
shell: """
325322
{params.picard} -Xmx2G -Xms2G \
@@ -344,7 +341,7 @@ rule rsem:
344341
rsem_paired_flag = '--paired-end' if config["layout"]=='paired' else '',
345342
conda:
346343
"env_config/rsem.yaml",
347-
resources: cpus="10", maxtime="8:00:00", mem_mb="60gb",
344+
resources: cpus="10", maxtime="8:00:00", mem_mb=60000,
348345

349346
shell: """
350347
{params.rsem_calc_exp_path} \
@@ -380,7 +377,7 @@ rule featurecounts:
380377
conda:
381378
"env_config/featurecounts.yaml",
382379

383-
resources: cpus="10", maxtime="8:00:00", mem_mb="100gb",
380+
resources: cpus="10", maxtime="8:00:00", mem_mb=100000,
384381

385382
shell: """
386383
{params.featurecounts} -T 32 {params.pair_flag} -s {params.strand} -a {params.gtf} -o featurecounts/featurecounts.readcounts.raw.tsv {input}
@@ -400,35 +397,21 @@ rule featurecounts:
400397
fi
401398
"""
402399

403-
# The number of genes compared for PCA, chosen by largest variance
404-
num_genes_compared = 500
405-
406400
rule pca_plots:
407401
input: "featurecounts/featurecounts.readcounts.tsv",
408402

409403
output:
410-
"plots/Heatmap_scaled_"+str(num_genes_compared)+"_features.png",
411-
# there potentially could be more, but this plot must exist. Make sure -p flag has number at least 2 if specified
412-
"plots/PCA_1_vs_2.png",
413-
"plots/PCA_Variance_Bar_Plot.png",
414-
"plots/Gene_Variance_Plot.png",
415-
404+
"plots/PCA_top_PC1_vs_PC2.png",
405+
"plots/PCA_top_PCA_variance_bar.png",
416406
params:
417-
num_genes = num_genes_compared,
418-
pca_plot_script = config['pca_plot_script'],
419-
407+
pca_plot_script = config['pca_plot_script'],
420408
conda:
421-
# uses a subset of the packages that featurecounts does
422409
"env_config/pcaplot.yaml",
423-
424-
resources: cpus="1", maxtime="1:00:00", mem_mb="2gb",
425-
410+
resources: cpus="1", maxtime="1:00:00", mem_mb=2000,
426411
shell: """
427412
python {params.pca_plot_script} \
428413
featurecounts/featurecounts.readcounts.tsv \
429-
plots \
430-
--genes_considered {params.num_genes} \
431-
--color_file sample_ref/sample_colors_hex.tsv
414+
plots
432415
"""
433416

434417

@@ -447,7 +430,7 @@ rule check_refs:
447430
picard_rrna_list = config["picard_rrna_list"],
448431
run_rsem = config["run_rsem"],
449432
rsem_ref = config["rsem_ref_path"],
450-
resources: cpus="1", maxtime="1:00:00", mem_mb="2gb",
433+
resources: cpus="1", maxtime="1:00:00", mem_mb=2000,
451434
shell: """
452435
453436
echo "\nChecking for reference annotation GTF file..."
@@ -534,7 +517,7 @@ rule build_refs:
534517
rsem_prepare_path = config["rsem_prep_ref_path"],
535518
conda:
536519
"env_config/build_refs.yaml",
537-
resources: cpus="12", maxtime="8:00:00", mem_mb="48gb",
520+
resources: cpus="12", maxtime="8:00:00", mem_mb=48000,
538521
shell: """
539522
REF_NAME=`basename {params.ref_fa} .fa`
540523
mkdir -p ref/pipeline_refs

cluster_profile/config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
jobs: 10
22
jobname: "{rule}.{jobid}"
3-
cluster: "sbatch -t {resources.maxtime} --mem={resources.mem_mb} -c {resources.cpus} --output=log_{rule}_%j.out --mail-type=FAIL"
3+
cluster: "sbatch -t {resources.maxtime} --mem={resources.mem_mb} -c {resources.cpus} --output=slurm_logs/log_{rule}_%j.out --mail-type=FAIL"
44
default-resources: [cpus=1, time_min=60]
55
max-jobs-per-second: 5
66
max-status-checks-per-second: 20

env_config/multiqc.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ channels:
66
dependencies:
77
- nomkl
88
- python=3.10
9-
- multiqc==1.12
9+
- multiqc=1.24.1
10+
- setuptools
11+
- pip
12+
- pip:
13+
- kaleido<1
1014
variables:
1115
PYTHONIOENCODING: utf-8

env_config/pcaplot.yaml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
1-
name: pcaplot
1+
name: pca_plotnine_env
22
channels:
3-
- conda-forge
4-
- bioconda
53
- defaults
64
dependencies:
7-
- nomkl
8-
- numpy==1.23.1
9-
- pandas==1.4.3
10-
- seaborn==0.11.2
11-
- scikit-learn==1.1.1
12-
- matplotlib<3.7 # Pin to older matplotlib
5+
- python=3.11
6+
- scikit-learn
7+
- matplotlib
8+
- numpy
9+
- seaborn
10+
- plotnine
11+
- pandas
12+
- fastcluster
1313

job.script.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@
2525
source /optnfs/common/miniconda3/etc/profile.d/conda.sh
2626
conda activate /dartfs/rc/nosnapshots/G/GMBSR_refs/envs/snakemake
2727

28+
#----- Make slurm logs folder
29+
mkdir -p slurm_logs
30+
2831
#----- Call Snakemake
2932
snakemake -s Snakefile \
3033
--conda-frontend conda \

0 commit comments

Comments (0)