|
| 1 | +/* |
| 2 | + __ __ ___ _ _ _____ ___ _____ |
| 3 | + | \/ |_ _| \ | |_ _|_ _| ____| |
| 4 | + | |\/| || || \| | | | | || _| |
| 5 | + | | | || || |\ | | | | || |___ |
| 6 | + |_| |_|___|_| \_| |_| |___|_____| |
| 7 | +
|
| 8 | +Method for Inferring Novel Transcripts and Isoforms using Equivalences classes |
| 9 | +
|
| 10 | +Author: Marek Cmero |
| 11 | +*/ |
| 12 | + |
| 13 | +code_base = file(bpipe.Config.config.script).parentFile.absolutePath |
| 14 | +load code_base + "/tools.groovy" |
| 15 | +load code_base + "/references.groovy" |
| 16 | + |
| 17 | +// initialise defaults if not provided |
| 18 | +if(!binding.variables.containsKey("fastqCaseFormat")){ |
| 19 | + fastqCaseFormat="cases/%_R*.fastq.gz" |
| 20 | +} |
| 21 | +if(!binding.variables.containsKey("fastqControlFormat")){ |
| 22 | + fastqControlFormat="controls/%_R*.fastq.gz" |
| 23 | +} |
| 24 | +if(!binding.variables.containsKey("assemblyFasta")){ |
| 25 | + assemblyFasta="" |
| 26 | +} |
| 27 | +if(!binding.variables.containsKey("run_de_step")){ |
| 28 | + run_de_step="true" |
| 29 | +} |
| 30 | +if(!binding.variables.containsKey("splice_motif_mismatch")){ |
| 31 | + splice_motif_mismatch=0 |
| 32 | +} |
| 33 | + |
| 34 | +fastq_dedupe = { |
| 35 | + from("*.gz"){ |
| 36 | + def sample_name = branch.name |
| 37 | + output.dir = sample_name |
| 38 | + produce(sample_name+'.1.fastq.gz'){ |
| 39 | + exec """ |
| 40 | + $dedupe $input.gz $output |
| 41 | + """, "fastq_dedupe" |
| 42 | + } |
| 43 | + } |
| 44 | +} |
| 45 | + |
| 46 | +trim = { |
| 47 | + output.dir = branch.name |
| 48 | + produce('trim1.fastq.gz') { |
| 49 | + if (assemblyFasta != '') { |
| 50 | + // no need to trim if assembly provided |
| 51 | + exec """ |
| 52 | + touch $output1 ; touch $output2 |
| 53 | + """ |
| 54 | + } else { |
| 55 | + exec """ |
| 56 | + $trimmomatic SE -threads $threads -phred$scores $input1 |
| 57 | + $output1.prefix |
| 58 | + LEADING:$minQScore TRAILING:$minQScore MINLEN:$min_read_length ; |
| 59 | + gzip $output1.prefix ; |
| 60 | + """, "trim" |
| 61 | + } |
| 62 | + } |
| 63 | +} |
| 64 | + |
| 65 | +assemble = { |
| 66 | + def sample_name = branch.name |
| 67 | + def Ks_for_soap = Ks.toString().contains(',') ? Ks.split(',').join(' ') : Ks |
| 68 | + output.dir = sample_name |
| 69 | + produce(sample_name + '_denovo_filt.fasta'){ |
| 70 | + if (assemblyFasta != '') { |
| 71 | + exec """ |
| 72 | + ln -s $assemblyFasta $output ; |
| 73 | + """ |
| 74 | + } else if (assembler.toLowerCase() == 'trinity') { |
| 75 | + exec """ |
| 76 | + $Trinity --seqType fq --max_memory ${assembly_mem}G --output $sample_name/trinity_assembly \ |
| 77 | + --left $input1 --right $input2 --CPU $threads ; |
| 78 | + ln -s trinity_assembly/Trinity.fasta $output ; |
| 79 | + """, "assemble" |
| 80 | + } else if (assembler.toLowerCase() == 'spades') { |
| 81 | + exec """ |
| 82 | + $rnaspades -1 $input1 -2 $input2 -k $Ks -t $threads -m $assembly_mem -o $sample_name/SPAdes_assembly ; |
| 83 | + ln -s SPAdes_assembly/contigs.fasta $output ; |
| 84 | + """, "assemble" |
| 85 | + } else { |
| 86 | + exec """ |
| 87 | + rlens=`gunzip -c $input1 \ |
| 88 | + | awk -v mrl=$min_read_length 'BEGIN {minlen = mrl; maxlen = 0} { |
| 89 | + if (NR % 4 == 2) { |
| 90 | + rlen = length(\$1) ; |
| 91 | + if (rlen > maxlen) {maxlen = rlen} |
| 92 | + if (rlen < minlen) {minlen = rlen} |
| 93 | + }} END {print minlen" "maxlen}'` ; |
| 94 | + min_rlen=\${rlens% *} ; |
| 95 | + max_rlen=\${rlens#* } ; |
| 96 | +
|
| 97 | + if [ ! -d $output.dir/SOAPassembly ]; then |
| 98 | + mkdir $output.dir/SOAPassembly ; |
| 99 | + fi ; |
| 100 | + cd $output.dir/SOAPassembly ; |
| 101 | +
|
| 102 | + echo \"max_rd_len=\$max_rlen\" > config.config ; |
| 103 | + echo -e \"[LIB]\\nq=../../$input1\" >> config.config ; |
| 104 | + if [ -e SOAP.fasta ]; then rm SOAP.fasta ; fi ; |
| 105 | + for k in $Ks_for_soap ; do |
| 106 | + if [ \$k -gt \$min_rlen ]; then |
| 107 | + echo "WARNING: Kmer size \$k exceeds minimum read length \${min_rlen}. Please double check parameters." ; |
| 108 | + else |
| 109 | + $soapdenovotrans pregraph -s config.config -o outputGraph_\$k -K \$k -p $threads ; |
| 110 | + $soapdenovotrans contig -g outputGraph_\$k ; |
| 111 | + cat outputGraph_\$k.contig | sed "s/^>/>k\${k}_/g" >> SOAP.fasta ; |
| 112 | + fi ; |
| 113 | + done ; |
| 114 | +
|
| 115 | + cd ../../ ; |
| 116 | + $dedupe in=$sample_name/SOAPassembly/SOAP.fasta out=stdout.fa threads=$threads overwrite=true | \ |
| 117 | + $fasta_formatter | \ |
| 118 | + awk '!/^>/ { next } { getline seq } length(seq) > $min_contig_len { print \$0 "\\n" seq }' > $output ; |
| 119 | + if [ ! -s $output ] ; then |
| 120 | + rm $output ; |
| 121 | + echo "ERROR: de novo assembled contigs fasta file is empty." ; |
| 122 | + echo "Please check paths for SOAPdenovoTrans, dedupe and fasta" ; |
| 123 | + echo "formatter are correct, and their dependencies are installed." ; |
| 124 | + fi ; |
| 125 | + """, "assemble" |
| 126 | + } |
| 127 | + } |
| 128 | +} |
| 129 | + |
| 130 | +create_salmon_index = { |
| 131 | + def sample_name = branch.name |
| 132 | + def salmon_index = sample_name + "/all_fasta_index" |
| 133 | + output.dir = salmon_index |
| 134 | + def index_fasta = output.dir + "/" + sample_name + ".fasta" |
| 135 | + produce(index_fasta, '*.bin'){ |
| 136 | + exec """ |
| 137 | + cat $trans_fasta $input.fasta > $output1 ; |
| 138 | + $salmon index -t $output1 -i $salmon_index -p $threads ; |
| 139 | + """, "create_salmon_index" |
| 140 | + } |
| 141 | +} |
| 142 | + |
| 143 | +run_salmon = { |
| 144 | + def workingDir = System.getProperty("user.dir"); |
| 145 | + def rf = inputs.split().collect { workingDir+"/$it" }[0] |
| 146 | + def salmon_index="all_fasta_index" |
| 147 | + def base_outdir = "salmon_out" |
| 148 | + def controls_dir = fastqControlFormat.split("/")[-2] |
| 149 | + def sample_name = branch.name |
| 150 | + |
| 151 | + if(type == "controls"){ |
| 152 | + sample_name = branch.parent.parent.name |
| 153 | + def control_name = branch.name |
| 154 | + |
| 155 | + output.dir = sample_name + "/" + controls_dir + "/" + control_name + "_salmon_out/aux_info" |
| 156 | + base_outdir = control_name + "_salmon_out" |
| 157 | + salmon_index = "../all_fasta_index" |
| 158 | + } else { |
| 159 | + output.dir = sample_name + "/salmon_out/aux_info" |
| 160 | + } |
| 161 | + |
| 162 | + produce("eq_classes.txt*"){ |
| 163 | + exec """ |
| 164 | + cd $output.dir/../.. ; |
| 165 | + $salmon quant --dumpEq --seqBias --validateMappings --hardFilter -i $salmon_index -l A -r $rf -p $threads -o $base_outdir |
| 166 | + """, "run_salmon" |
| 167 | + } |
| 168 | +} |
| 169 | + |
| 170 | +create_ec_count_matrix = { |
| 171 | + def sample_name = branch.name |
| 172 | + def sample_names = inputs.split().collect { it.split('/')[-3].split('_salmon_out')[0] } |
| 173 | + sample_names.set(0, sample_name) // case sample, rest are controls |
| 174 | + sample_names = sample_names.join(',') |
| 175 | + output.dir = sample_name |
| 176 | + produce("ec_count_matrix.txt"){ |
| 177 | + exec """ |
| 178 | + $python $code_base/DE/create_ec_count_matrix.py $inputs $sample_names $output1 ; |
| 179 | + """, "create_ec_count_matrix" |
| 180 | + } |
| 181 | +} |
| 182 | + |
| 183 | +run_de = { |
| 184 | + def run_de_bool = run_de_step.toBoolean() |
| 185 | + def sample_name = branch.name |
| 186 | + output.dir = sample_name |
| 187 | + produce("eq_classes_de.txt"){ |
| 188 | + if(run_de_bool) { |
| 189 | + exec """ |
| 190 | + ${R}script $code_base/DE/compare_eq_classes.R $sample_name $input $trans_fasta $output --FDR=$fdr --minCPM=$min_cpm --minLogFC=$min_logfc |
| 191 | + """, "run_de" |
| 192 | + } else { |
| 193 | + exec """ |
| 194 | + $python $code_base/DE/get_novel_contigs.py $input $trans_fasta $output.dir/${sample_name}_denovo_filt.fasta |
| 195 | + """, "run_de" |
| 196 | + } |
| 197 | + } |
| 198 | +} |
| 199 | + |
| 200 | +filter_on_significant_ecs = { |
| 201 | + def sample_name = branch.name |
| 202 | + output.dir = sample_name |
| 203 | + produce("de_contigs.fasta"){ |
| 204 | + exec """ |
| 205 | + $python $code_base/util/filter_fasta.py $output.dir/${sample_name}_denovo_filt.fasta $input.txt --col_id contig > $output1 ; |
| 206 | + """, "filter_sig_ecs" |
| 207 | + } |
| 208 | +} |
| 209 | + |
| 210 | +align_contigs_against_genome = { |
| 211 | + def sample_name = branch.name |
| 212 | + output.dir = sample_name |
| 213 | + produce('aligned_contigs_against_genome.sam'){ |
| 214 | + exec """ |
| 215 | + $gmap -D $gmap_refdir -d $gmap_genome -f samse -t $threads -x $min_gap --max-intronlength-ends=500000 -n 0 $input.fasta > $output |
| 216 | + """, "align_contigs_against_genome" |
| 217 | + } |
| 218 | +} |
| 219 | + |
| 220 | +annotate_contigs = { |
| 221 | + def sample_name = branch.name |
| 222 | + output.dir = sample_name |
| 223 | + produce("annotated_contigs.vcf", "annotated_contigs_info.tsv", "annotated_contigs.bam"){ |
| 224 | + exec """ |
| 225 | + $python ${code_base}/annotate/annotate_contigs.py \ |
| 226 | + $sample_name $input.bam \ |
| 227 | + $ann_info $tx_annotation \ |
| 228 | + $output.bam $output.tsv \ |
| 229 | + --minClip $min_clip \ |
| 230 | + --minGap $min_gap \ |
| 231 | + --minMatch $min_match \ |
| 232 | + --log $output.dir/annotate.log > $output.vcf |
| 233 | + """, "annotate_contigs" |
| 234 | + } |
| 235 | +} |
| 236 | + |
| 237 | +refine_contigs = { |
| 238 | + def sample_name = branch.name |
| 239 | + output.dir = sample_name |
| 240 | + produce("novel_contigs.vcf", "novel_contigs_info.tsv", "novel_contigs.bam", "novel_contigs.fasta"){ |
| 241 | + exec """ |
| 242 | + $python ${code_base}/annotate/refine_annotations.py \ |
| 243 | + $input.tsv $input.vcf $input.bam $tx_annotation \ |
| 244 | + $genome_fasta $output.prefix \ |
| 245 | + --minClip $min_clip \ |
| 246 | + --minGap $min_gap \ |
| 247 | + --mismatches $splice_motif_mismatch \ |
| 248 | + --log $output.dir/refine.log > $output.vcf ; |
| 249 | + $samtools index $output.bam ; |
| 250 | + $python $code_base/util/filter_fasta.py $input.fasta $output.tsv --col_id contig_id > $output.fasta ; |
| 251 | + """, "refine_contigs" |
| 252 | + } |
| 253 | +} |
| 254 | + |
| 255 | +calculate_VAF = { |
| 256 | + output.dir = branch.name |
| 257 | + produce("vaf_estimates.txt"){ |
| 258 | + exec """ |
| 259 | + ${R}script ${code_base}/annotate/estimate_VAF.R $branch.name/ec_count_matrix.txt $branch.name/salmon_out/quant.sf $input.tsv $trans_fasta $tx2gene $output |
| 260 | + """, "calculate_VAF" |
| 261 | + } |
| 262 | +} |
| 263 | + |
| 264 | +post_process = { |
| 265 | + def sample_name = branch.name |
| 266 | + def var_filter = var_filter.split(',').join(' ') |
| 267 | + def gf_arg = gene_filter == '' ? '' : '--gene_filter ' + gene_filter |
| 268 | + def vf_arg = var_filter == '' ? '' : '--var_filter ' + var_filter |
| 269 | + output.dir = sample_name |
| 270 | + produce(sample_name + '_results.tsv'){ |
| 271 | + exec """ |
| 272 | + $python ${code_base}/annotate/post_process.py \ |
| 273 | + $sample_name \ |
| 274 | + $input.tsv \ |
| 275 | + $input.fasta \ |
| 276 | + $sample_name/eq_classes_de.txt \ |
| 277 | + $sample_name/vaf_estimates.txt \ |
| 278 | + $gf_arg \ |
| 279 | + $vf_arg \ |
| 280 | + --log $output.dir/postprocess.log > $output |
| 281 | + """ |
| 282 | + } |
| 283 | +} |
| 284 | + |
| 285 | +sort_and_index_bam = { |
| 286 | + output.dir = new File(input.sam).getParentFile() |
| 287 | + transform('sam') to ('bam') { |
| 288 | + exec """ |
| 289 | + $samtools sort -@ $threads -m ${sort_ram} $input.sam -o $output ; |
| 290 | + $samtools index $output |
| 291 | + """, "sort_and_index_bam" |
| 292 | + } |
| 293 | +} |
| 294 | + |
| 295 | +run { fastqCaseFormat * [ fastq_dedupe + |
| 296 | + trim + |
| 297 | + assemble + |
| 298 | + create_salmon_index + |
| 299 | + [ fastqCaseFormat * [ run_salmon.using(type: "case") ], |
| 300 | + fastqControlFormat * [ run_salmon.using(type:"controls") ] ] + |
| 301 | + create_ec_count_matrix + |
| 302 | + run_de + |
| 303 | + filter_on_significant_ecs + |
| 304 | + align_contigs_against_genome + |
| 305 | + sort_and_index_bam + |
| 306 | + annotate_contigs + |
| 307 | + refine_contigs + |
| 308 | + calculate_VAF + |
| 309 | + post_process ] |
| 310 | +} |
0 commit comments