Skip to content

Commit 603a57d

Browse files
committed
Support for single-end data
1 parent 88b2868 commit 603a57d

File tree

3 files changed

+325
-1
lines changed

3 files changed

+325
-1
lines changed

MINTIE_SE.groovy

Lines changed: 310 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,310 @@
1+
/*
2+
__ __ ___ _ _ _____ ___ _____
3+
| \/ |_ _| \ | |_ _|_ _| ____|
4+
| |\/| || || \| | | | | || _|
5+
| | | || || |\ | | | | || |___
6+
|_| |_|___|_| \_| |_| |___|_____|
7+
8+
Method for Inferring Novel Transcripts and Isoforms using Equivalences classes
9+
10+
Author: Marek Cmero
11+
*/
12+
13+
code_base = file(bpipe.Config.config.script).parentFile.absolutePath
14+
load code_base + "/tools.groovy"
15+
load code_base + "/references.groovy"
16+
17+
// initialise defaults if not provided
18+
if(!binding.variables.containsKey("fastqCaseFormat")){
19+
fastqCaseFormat="cases/%_R*.fastq.gz"
20+
}
21+
if(!binding.variables.containsKey("fastqControlFormat")){
22+
fastqControlFormat="controls/%_R*.fastq.gz"
23+
}
24+
if(!binding.variables.containsKey("assemblyFasta")){
25+
assemblyFasta=""
26+
}
27+
if(!binding.variables.containsKey("run_de_step")){
28+
run_de_step="true"
29+
}
30+
if(!binding.variables.containsKey("splice_motif_mismatch")){
31+
splice_motif_mismatch=0
32+
}
33+
34+
fastq_dedupe = {
35+
from("*.gz"){
36+
def sample_name = branch.name
37+
output.dir = sample_name
38+
produce(sample_name+'.1.fastq.gz'){
39+
exec """
40+
$dedupe $input.gz $output
41+
""", "fastq_dedupe"
42+
}
43+
}
44+
}
45+
46+
trim = {
47+
output.dir = branch.name
48+
produce('trim1.fastq.gz') {
49+
if (assemblyFasta != '') {
50+
// no need to trim if assembly provided
51+
exec """
52+
touch $output1 ; touch $output2
53+
"""
54+
} else {
55+
exec """
56+
$trimmomatic SE -threads $threads -phred$scores $input1
57+
$output1.prefix
58+
LEADING:$minQScore TRAILING:$minQScore MINLEN:$min_read_length ;
59+
gzip $output1.prefix ;
60+
""", "trim"
61+
}
62+
}
63+
}
64+
65+
assemble = {
66+
def sample_name = branch.name
67+
def Ks_for_soap = Ks.toString().contains(',') ? Ks.split(',').join(' ') : Ks
68+
output.dir = sample_name
69+
produce(sample_name + '_denovo_filt.fasta'){
70+
if (assemblyFasta != '') {
71+
exec """
72+
ln -s $assemblyFasta $output ;
73+
"""
74+
} else if (assembler.toLowerCase() == 'trinity') {
75+
exec """
76+
$Trinity --seqType fq --max_memory ${assembly_mem}G --output $sample_name/trinity_assembly \
77+
--left $input1 --right $input2 --CPU $threads ;
78+
ln -s trinity_assembly/Trinity.fasta $output ;
79+
""", "assemble"
80+
} else if (assembler.toLowerCase() == 'spades') {
81+
exec """
82+
$rnaspades -1 $input1 -2 $input2 -k $Ks -t $threads -m $assembly_mem -o $sample_name/SPAdes_assembly ;
83+
ln -s SPAdes_assembly/contigs.fasta $output ;
84+
""", "assemble"
85+
} else {
86+
exec """
87+
rlens=`gunzip -c $input1 \
88+
| awk -v mrl=$min_read_length 'BEGIN {minlen = mrl; maxlen = 0} {
89+
if (NR % 4 == 2) {
90+
rlen = length(\$1) ;
91+
if (rlen > maxlen) {maxlen = rlen}
92+
if (rlen < minlen) {minlen = rlen}
93+
}} END {print minlen" "maxlen}'` ;
94+
min_rlen=\${rlens% *} ;
95+
max_rlen=\${rlens#* } ;
96+
97+
if [ ! -d $output.dir/SOAPassembly ]; then
98+
mkdir $output.dir/SOAPassembly ;
99+
fi ;
100+
cd $output.dir/SOAPassembly ;
101+
102+
echo \"max_rd_len=\$max_rlen\" > config.config ;
103+
echo -e \"[LIB]\\nq=../../$input1\" >> config.config ;
104+
if [ -e SOAP.fasta ]; then rm SOAP.fasta ; fi ;
105+
for k in $Ks_for_soap ; do
106+
if [ \$k -gt \$min_rlen ]; then
107+
echo "WARNING: Kmer size \$k exceeds minimum read length \${min_rlen}. Please double check parameters." ;
108+
else
109+
$soapdenovotrans pregraph -s config.config -o outputGraph_\$k -K \$k -p $threads ;
110+
$soapdenovotrans contig -g outputGraph_\$k ;
111+
cat outputGraph_\$k.contig | sed "s/^>/>k\${k}_/g" >> SOAP.fasta ;
112+
fi ;
113+
done ;
114+
115+
cd ../../ ;
116+
$dedupe in=$sample_name/SOAPassembly/SOAP.fasta out=stdout.fa threads=$threads overwrite=true | \
117+
$fasta_formatter | \
118+
awk '!/^>/ { next } { getline seq } length(seq) > $min_contig_len { print \$0 "\\n" seq }' > $output ;
119+
if [ ! -s $output ] ; then
120+
rm $output ;
121+
echo "ERROR: de novo assembled contigs fasta file is empty." ;
122+
echo "Please check paths for SOAPdenovoTrans, dedupe and fasta" ;
123+
echo "formatter are correct, and their dependencies are installed." ;
124+
fi ;
125+
""", "assemble"
126+
}
127+
}
128+
}
129+
130+
create_salmon_index = {
131+
def sample_name = branch.name
132+
def salmon_index = sample_name + "/all_fasta_index"
133+
output.dir = salmon_index
134+
def index_fasta = output.dir + "/" + sample_name + ".fasta"
135+
produce(index_fasta, '*.bin'){
136+
exec """
137+
cat $trans_fasta $input.fasta > $output1 ;
138+
$salmon index -t $output1 -i $salmon_index -p $threads ;
139+
""", "create_salmon_index"
140+
}
141+
}
142+
143+
run_salmon = {
144+
def workingDir = System.getProperty("user.dir");
145+
def rf = inputs.split().collect { workingDir+"/$it" }[0]
146+
def salmon_index="all_fasta_index"
147+
def base_outdir = "salmon_out"
148+
def controls_dir = fastqControlFormat.split("/")[-2]
149+
def sample_name = branch.name
150+
151+
if(type == "controls"){
152+
sample_name = branch.parent.parent.name
153+
def control_name = branch.name
154+
155+
output.dir = sample_name + "/" + controls_dir + "/" + control_name + "_salmon_out/aux_info"
156+
base_outdir = control_name + "_salmon_out"
157+
salmon_index = "../all_fasta_index"
158+
} else {
159+
output.dir = sample_name + "/salmon_out/aux_info"
160+
}
161+
162+
produce("eq_classes.txt*"){
163+
exec """
164+
cd $output.dir/../.. ;
165+
$salmon quant --dumpEq --seqBias --validateMappings --hardFilter -i $salmon_index -l A -r $rf -p $threads -o $base_outdir
166+
""", "run_salmon"
167+
}
168+
}
169+
170+
create_ec_count_matrix = {
171+
def sample_name = branch.name
172+
def sample_names = inputs.split().collect { it.split('/')[-3].split('_salmon_out')[0] }
173+
sample_names.set(0, sample_name) // case sample, rest are controls
174+
sample_names = sample_names.join(',')
175+
output.dir = sample_name
176+
produce("ec_count_matrix.txt"){
177+
exec """
178+
$python $code_base/DE/create_ec_count_matrix.py $inputs $sample_names $output1 ;
179+
""", "create_ec_count_matrix"
180+
}
181+
}
182+
183+
run_de = {
184+
def run_de_bool = run_de_step.toBoolean()
185+
def sample_name = branch.name
186+
output.dir = sample_name
187+
produce("eq_classes_de.txt"){
188+
if(run_de_bool) {
189+
exec """
190+
${R}script $code_base/DE/compare_eq_classes.R $sample_name $input $trans_fasta $output --FDR=$fdr --minCPM=$min_cpm --minLogFC=$min_logfc
191+
""", "run_de"
192+
} else {
193+
exec """
194+
$python $code_base/DE/get_novel_contigs.py $input $trans_fasta $output.dir/${sample_name}_denovo_filt.fasta
195+
""", "run_de"
196+
}
197+
}
198+
}
199+
200+
filter_on_significant_ecs = {
201+
def sample_name = branch.name
202+
output.dir = sample_name
203+
produce("de_contigs.fasta"){
204+
exec """
205+
$python $code_base/util/filter_fasta.py $output.dir/${sample_name}_denovo_filt.fasta $input.txt --col_id contig > $output1 ;
206+
""", "filter_sig_ecs"
207+
}
208+
}
209+
210+
align_contigs_against_genome = {
211+
def sample_name = branch.name
212+
output.dir = sample_name
213+
produce('aligned_contigs_against_genome.sam'){
214+
exec """
215+
$gmap -D $gmap_refdir -d $gmap_genome -f samse -t $threads -x $min_gap --max-intronlength-ends=500000 -n 0 $input.fasta > $output
216+
""", "align_contigs_against_genome"
217+
}
218+
}
219+
220+
annotate_contigs = {
221+
def sample_name = branch.name
222+
output.dir = sample_name
223+
produce("annotated_contigs.vcf", "annotated_contigs_info.tsv", "annotated_contigs.bam"){
224+
exec """
225+
$python ${code_base}/annotate/annotate_contigs.py \
226+
$sample_name $input.bam \
227+
$ann_info $tx_annotation \
228+
$output.bam $output.tsv \
229+
--minClip $min_clip \
230+
--minGap $min_gap \
231+
--minMatch $min_match \
232+
--log $output.dir/annotate.log > $output.vcf
233+
""", "annotate_contigs"
234+
}
235+
}
236+
237+
refine_contigs = {
238+
def sample_name = branch.name
239+
output.dir = sample_name
240+
produce("novel_contigs.vcf", "novel_contigs_info.tsv", "novel_contigs.bam", "novel_contigs.fasta"){
241+
exec """
242+
$python ${code_base}/annotate/refine_annotations.py \
243+
$input.tsv $input.vcf $input.bam $tx_annotation \
244+
$genome_fasta $output.prefix \
245+
--minClip $min_clip \
246+
--minGap $min_gap \
247+
--mismatches $splice_motif_mismatch \
248+
--log $output.dir/refine.log > $output.vcf ;
249+
$samtools index $output.bam ;
250+
$python $code_base/util/filter_fasta.py $input.fasta $output.tsv --col_id contig_id > $output.fasta ;
251+
""", "refine_contigs"
252+
}
253+
}
254+
255+
calculate_VAF = {
256+
output.dir = branch.name
257+
produce("vaf_estimates.txt"){
258+
exec """
259+
${R}script ${code_base}/annotate/estimate_VAF.R $branch.name/ec_count_matrix.txt $branch.name/salmon_out/quant.sf $input.tsv $trans_fasta $tx2gene $output
260+
""", "calculate_VAF"
261+
}
262+
}
263+
264+
post_process = {
265+
def sample_name = branch.name
266+
def var_filter = var_filter.split(',').join(' ')
267+
def gf_arg = gene_filter == '' ? '' : '--gene_filter ' + gene_filter
268+
def vf_arg = var_filter == '' ? '' : '--var_filter ' + var_filter
269+
output.dir = sample_name
270+
produce(sample_name + '_results.tsv'){
271+
exec """
272+
$python ${code_base}/annotate/post_process.py \
273+
$sample_name \
274+
$input.tsv \
275+
$input.fasta \
276+
$sample_name/eq_classes_de.txt \
277+
$sample_name/vaf_estimates.txt \
278+
$gf_arg \
279+
$vf_arg \
280+
--log $output.dir/postprocess.log > $output
281+
"""
282+
}
283+
}
284+
285+
sort_and_index_bam = {
286+
output.dir = new File(input.sam).getParentFile()
287+
transform('sam') to ('bam') {
288+
exec """
289+
$samtools sort -@ $threads -m ${sort_ram} $input.sam -o $output ;
290+
$samtools index $output
291+
""", "sort_and_index_bam"
292+
}
293+
}
294+
295+
run { fastqCaseFormat * [ fastq_dedupe +
296+
trim +
297+
assemble +
298+
create_salmon_index +
299+
[ fastqCaseFormat * [ run_salmon.using(type: "case") ],
300+
fastqControlFormat * [ run_salmon.using(type:"controls") ] ] +
301+
create_ec_count_matrix +
302+
run_de +
303+
filter_on_significant_ecs +
304+
align_contigs_against_genome +
305+
sort_and_index_bam +
306+
annotate_contigs +
307+
refine_contigs +
308+
calculate_VAF +
309+
post_process ]
310+
}

mintie

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ do
2626
echo -e "\nusage (setup references): mintie -r "
2727
echo -e "\nusage (setup test data): mintie -t "
2828
echo -e "\nusage (wrapper): mintie -w -p [params.txt] cases/*.fastq.gz controls/*.fastq.gz "
29-
echo -e "\nusage (direct):\n export \$MINTIEDIR=$MINTIE_HOME;\n bpipe run -@$MINTIEDIR/params.txt [ <other bpipe options >] \n\t \$MINTIEDIR/MINTIE.groovy cases/*.fastq.gz controls/*fastq.gz"
29+
echo -e "\nusage (direct):\n export \$MINTIEDIR=$MINTIE_HOME;\n bpipe run -@\$MINTIEDIR/params.txt [ <other bpipe options >] \n\t \$MINTIEDIR/MINTIE.groovy cases/*.fastq.gz controls/*fastq.gz"
30+
echo -e "\nusage (direct single-end):\n export \$MINTIEDIR=$MINTIE_HOME;\n bpipe run -@\$MINTIEDIR/params.txt [ <other bpipe options >] \n\t \$MINTIEDIR/MINTIE_SE.groovy cases/*.fastq.gz controls/*fastq.gz"
3031
echo ""
3132
exit 0
3233
shift

test/run_test_se.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/sh
2+
3+
# modify reference to control fasta
4+
grep -v allvars-control references.groovy > backup_references.groovy
5+
echo "trans_fasta=\"$PWD/test/data/allvars-control.fasta\"" > tmp.txt
6+
cat backup_references.groovy tmp.txt > references.groovy ; rm tmp.txt
7+
8+
cases=`ls test/data/cases/*gz | grep R1`
9+
controls=`ls test/data/controls/*gz | grep R1`
10+
bpipe @test/test_params.txt MINTIE_SE.groovy $cases $controls
11+
12+
# restore original references
13+
mv backup_references.groovy references.groovy

0 commit comments

Comments
 (0)