Skip to content

Commit 5658bc8

Browse files
authored
Merge pull request #129 from PMCC-BioinformaticsCore/release-v0.12.2
Release v0.12.2
2 parents 0b26db8 + 02862f6 commit 5658bc8

File tree

5 files changed

+350
-4
lines changed

5 files changed

+350
-4
lines changed

janis_bioinformatics/__meta__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
__version__ = "v0.12.1"
1+
__version__ = "v0.12.2"
22
description = "Bioinformatics tools for Janis; the Pipeline creation helper"
Lines changed: 340 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,340 @@
1+
from datetime import datetime
2+
3+
from janis_core import (
4+
File,
5+
String,
6+
Array,
7+
InputSelector,
8+
WorkflowMetadata,
9+
ScatterDescription,
10+
ScatterMethods,
11+
InputDocumentation,
12+
InputQualityType,
13+
Int,
14+
)
15+
from janis_unix.data_types import TextFile
16+
from janis_unix.tools import UncompressArchive
17+
from janis_bioinformatics.data_types import (
18+
FastaWithDict,
19+
VcfTabix,
20+
FastqGzPair,
21+
Bed,
22+
Bam,
23+
BamBai,
24+
)
25+
from janis_bioinformatics.tools.bioinformaticstoolbase import BioinformaticsWorkflow
26+
from janis_bioinformatics.tools.babrahambioinformatics import FastQC_0_11_5
27+
from janis_bioinformatics.tools.bcftools import BcfToolsSort_1_9
28+
from janis_bioinformatics.tools.common import (
29+
BwaAligner,
30+
MergeAndMarkBams_4_1_3,
31+
GATKBaseRecalBQSRWorkflow_4_1_3,
32+
SplitMultiAlleleNormaliseVcf,
33+
)
34+
from janis_bioinformatics.tools.gatk4 import Gatk4HaplotypeCaller_4_1_3
35+
from janis_bioinformatics.tools.htslib import BGZip_1_9, TabixLatest
36+
from janis_bioinformatics.tools.papenfuss import Gridss_2_6_2
37+
from janis_bioinformatics.tools.pmac import (
38+
ParseFastqcAdaptors,
39+
AnnotateDepthOfCoverage_0_1_0,
40+
PerformanceSummaryTargeted_0_1_0,
41+
CombineVariants_0_0_8,
42+
AddBamStatsGermline_0_1_0,
43+
)
44+
from janis_bioinformatics.tools.variantcallers import (
45+
GatkSomaticVariantCallerTumorOnlyTargeted,
46+
)
47+
from janis_bioinformatics.tools.vcflib import VcfLength_1_0_1, VcfFilter_1_0_1
48+
from janis_bioinformatics.tools.igvtools import IgvIndexFeature_2_5_3
49+
50+
# from janis_molpath.tools.pathos import NormaliseVcf_1_5_4, Vcf2Tsv_1_5_4
51+
# from janis_molpath.tools.scripts.postgridss import GRIDSSProcessOutput
52+
53+
54+
class MolpathTumorOnly_1_0_0(BioinformaticsWorkflow):
    """Tumor-only targeted workflow: FASTQ pairs through to length-filtered VCFs.

    Steps wired up by ``constructor`` (in order):

    1. FastQC on every FASTQ pair, then parse adaptor sequences from the
       FastQC data files.
    2. Align each FASTQ pair with ``BwaAligner`` (adaptors passed to cutadapt),
       then merge the per-pair BAMs and mark duplicates.
    3. Performance reporting on the merged BAM (depth of coverage annotation
       and targeted performance summary).
    4. Structural variant calling with GRIDSS on the merged BAM.
    5. GATK base recalibration, then Mutect2 (tumor-only, targeted) and
       HaplotypeCaller on the recalibrated BAM.
    6. Combine the normalised HaplotypeCaller and Mutect2 calls, sort them and
       annotate them with BAM statistics.
    7. Compress/index the annotated VCF, add a variant length column and
       split variants on the ``filter_for_vcfs`` expression.

    NOTE: the inputs ``seqrun``, ``panel_name``, ``vcfcols``,
    ``mutalyzer_server`` and ``pathos_db`` are declared but not consumed by any
    active step; apart from ``panel_name`` they are only referenced by the
    commented-out in-house steps (NormaliseVcf / Vcf2Tsv) kept at the end of
    ``constructor``.
    """

    def id(self):
        """Return the unique identifier of this workflow."""
        return "MolpathTumorOnlyWorkflow"

    def friendly_name(self):
        """Return the human-readable name of this workflow."""
        return "Molpath Tumor Only Workflow"

    def tool_provider(self):
        """Return the name of the organisation providing this workflow."""
        return "Peter MacCallum Cancer Centre"

    def bind_metadata(self):
        """Return the workflow metadata (version, contributors, dates)."""
        return WorkflowMetadata(
            version="v1.0.0",
            contributors=["Jiaan Yu"],
            dateCreated=datetime(2020, 6, 12),
            dateUpdated=datetime(2020, 8, 10),
        )

    def constructor(self):
        """Declare the inputs, steps and outputs of the workflow."""

        # Inputs
        self.input("sample_name", String)
        self.input("fastqs", Array(FastqGzPair))
        self.input("seqrun", String, doc="SeqRun Name (for Vcf2Tsv)")
        self.input("reference", FastaWithDict)
        self.input("region_bed", Bed)
        self.input("region_bed_extended", Bed)
        self.input("region_bed_annotated", Bed)
        self.input("genecoverage_bed", Bed)
        self.input("genome_file", TextFile)
        self.input("panel_name", String)
        self.input("vcfcols", TextFile)
        self.input("black_list", Bed(optional=True))
        self.input("snps_dbsnp", VcfTabix)
        self.input("snps_1000gp", VcfTabix)
        self.input("known_indels", VcfTabix)
        self.input("mills_indels", VcfTabix)
        self.input("mutalyzer_server", String)
        self.input("pathos_db", String)
        self.input("maxRecordsInRam", Int)
        # tumor only
        self.input("gnomad", VcfTabix)
        self.input("panel_of_normals", VcfTabix(optional=True))

        # fastqc (one job per fastq pair)
        self.step(
            "fastqc", FastQC_0_11_5(reads=self.fastqs, threads=4), scatter="reads"
        )
        # get the overrepresented sequences from fastqc
        self.step(
            "getfastqc_adapters",
            ParseFastqcAdaptors(fastqc_datafiles=self.fastqc.datafile),
            scatter="fastqc_datafiles",
        )
        # align and generate sorted index bam
        # (scattered jointly over the fastq pairs and their parsed adaptors)
        self.step(
            "align_and_sort",
            BwaAligner(
                fastq=self.fastqs,
                reference=self.reference,
                sample_name=self.sample_name,
                sortsam_tmpDir=".",
                cutadapt_adapter=self.getfastqc_adapters,
                cutadapt_removeMiddle3Adapter=self.getfastqc_adapters,
            ),
            scatter=["fastq", "cutadapt_adapter", "cutadapt_removeMiddle3Adapter"],
        )
        # merge into one bam and markdups
        self.step(
            "merge_and_mark",
            MergeAndMarkBams_4_1_3(
                bams=self.align_and_sort.out,
                sampleName=self.sample_name,
                maxRecordsInRam=self.maxRecordsInRam,
            ),
        )
        # performance: depth of coverage (doc)
        self.step(
            "annotate_doc",
            AnnotateDepthOfCoverage_0_1_0(
                bam=self.merge_and_mark.out,
                bed=self.region_bed_annotated,
                reference=self.reference,
                sample_name=self.sample_name,
            ),
        )

        # performance
        self.step(
            "performance_summary",
            PerformanceSummaryTargeted_0_1_0(
                bam=self.merge_and_mark.out,
                region_bed=self.region_bed,
                genecoverage_bed=self.genecoverage_bed,
                sample_name=self.sample_name,
                genome_file=self.genome_file,
            ),
        )
        # gridss
        self.step(
            "gridss",
            Gridss_2_6_2(
                bams=self.merge_and_mark.out,
                reference=self.reference,
                blacklist=self.black_list,
                tmpdir=".",
            ),
        )
        # post gridss r for tumor only + tumor only mode
        # self.step("gridss_post_r", GRIDSSProcessOutput(inp=self.gridss.out))
        # gatk bqsr bam
        self.step(
            "bqsr",
            GATKBaseRecalBQSRWorkflow_4_1_3(
                bam=self.merge_and_mark.out,
                intervals=self.region_bed_extended,
                reference=self.reference,
                snps_dbsnp=self.snps_dbsnp,
                snps_1000gp=self.snps_1000gp,
                known_indels=self.known_indels,
                mills_indels=self.mills_indels,
            ),
        )
        # mutect2
        self.step(
            "mutect2",
            GatkSomaticVariantCallerTumorOnlyTargeted(
                bam=self.bqsr.out,
                intervals=self.region_bed_extended,
                reference=self.reference,
                gnomad=self.gnomad,
                panel_of_normals=self.panel_of_normals,
            ),
        )
        # haplotypecaller
        # TODO(review): the original note here read "take base recal away from
        # the" and was cut off; confirm the intended change with the author.
        self.step(
            "haplotype_caller",
            Gatk4HaplotypeCaller_4_1_3(
                inputRead=self.bqsr.out,
                intervals=self.region_bed_extended,
                reference=self.reference,
                dbsnp=self.snps_dbsnp,
                pairHmmImplementation="LOGLESS_CACHING",
            ),
        )
        self.step(
            "splitnormalisevcf",
            SplitMultiAlleleNormaliseVcf(
                compressedVcf=self.haplotype_caller.out, reference=self.reference
            ),
        )
        # combine variants
        self.step(
            "combinevariants",
            CombineVariants_0_0_8(
                vcfs=[self.splitnormalisevcf.out, self.mutect2.out],
                type="germline",
                columns=["AD", "DP", "AF", "GT"],
            ),
        )
        self.step("compressvcf", BGZip_1_9(file=self.combinevariants.out))
        self.step("sortvcf", BcfToolsSort_1_9(vcf=self.compressvcf.out))
        self.step("uncompressvcf", UncompressArchive(file=self.sortvcf.out, force=True))
        # addbamstats
        self.step(
            "addbamstats",
            AddBamStatsGermline_0_1_0(
                bam=self.merge_and_mark.out,
                vcf=self.uncompressvcf.out,
                reference=self.reference,
            ),
        )
        # Molpath specific processes
        self.step("compressvcf2", BGZip_1_9(file=self.addbamstats.out))
        self.step("tabixvcf", TabixLatest(inp=self.compressvcf2.out))
        self.step(
            "calculate_variant_length",
            VcfLength_1_0_1(vcf=self.tabixvcf.out),
            doc="Add the length column for the output of AddBamStats",
        )

        # Declared with String (not builtin str) to match the other string inputs.
        filter_for_variants = self.input(
            "filter_for_vcfs", String, default="length > 150"
        )
        # Same filter expression applied twice: once as-is ("failed" set) and
        # once inverted (the set carried forward).
        self.step(
            "filter_variants_1_failed",
            VcfFilter_1_0_1(
                vcf=self.calculate_variant_length.out, info_filter=filter_for_variants
            ),
        )
        self.step(
            "filter_variants_1",
            VcfFilter_1_0_1(
                vcf=self.calculate_variant_length.out,
                info_filter=filter_for_variants,
                invert=True,  # -v param
            ),
        )

        # Jiaan: copy over from the FRCP, can take the block comment out
        # # This one is the in-house molpath step
        # self.step(
        #     "normalise_vcfs",
        #     NormaliseVcf_1_5_4(
        #         pathos_version=self.pathos_db,
        #         mutalyzer=self.mutalyzer_server,  # mutalyzer="https://vmpr-res-mutalyzer1.unix.petermac.org.au",
        #         rdb=self.pathos_db,  # rdb="pa_uat",
        #         inp=self.filter_variants_1.out,
        #     ),
        # )

        # # repeat remove 150bp variants (workaround for normalise_vcf bug)
        # self.step(
        #     "filter_variants_2_failed",
        #     VcfFilter_1_0_1(
        #         vcf=self.normalise_vcfs.out, info_filter=filter_for_variants
        #     ),
        # )
        # self.step(
        #     "filter_variants_2",
        #     VcfFilter_1_0_1(
        #         vcf=self.normalise_vcfs.out,
        #         info_filter=filter_for_variants,
        #         invert=True,  # -v param
        #     ),
        # )

        # self.step(
        #     "convert_to_tsv",
        #     Vcf2Tsv_1_5_4(
        #         pathos_version=self.pathos_db,
        #         inp=self.filter_variants_2.out,
        #         sample=self.sample_name,
        #         columns=self.vcfcols,
        #         seqrun=self.seqrun,
        #     ),
        # )

        # self.step(
        #     "index_with_igvtools", IgvIndexFeature_2_5_3(inp=self.filter_variants_2.out)
        # )

        # output
        self.output("fastq_qc", source=self.fastqc.out, output_folder="QC")

        self.output("markdups_bam", source=self.merge_and_mark.out, output_folder="BAM")

        self.output(
            "doc_out", source=self.annotate_doc.out, output_folder="PERFORMANCE"
        )
        self.output(
            "summary", source=self.performance_summary.out, output_folder="PERFORMANCE"
        )
        self.output(
            "gene_summary",
            source=self.performance_summary.geneFileOut,
            output_folder="PERFORMANCE",
        )
        self.output(
            "region_summary",
            source=self.performance_summary.regionFileOut,
            output_folder="PERFORMANCE",
        )

        self.output("gridss_vcf", source=self.gridss.out, output_folder="SV")
        self.output("gridss_bam", source=self.gridss.assembly, output_folder="SV")

        self.output(
            "haplotypecaller_vcf",
            source=self.haplotype_caller.out,
            output_folder="VCF",
        )
        self.output(
            "haplotypecaller_bam",
            source=self.haplotype_caller.bam,
            output_folder="VCF",
        )
        self.output(
            "haplotypecaller_norm",
            source=self.splitnormalisevcf.out,
            output_folder="VCF",
        )
        self.output("mutect2_vcf", source=self.mutect2.variants, output_folder="VCF")
        self.output("mutect2_bam", source=self.mutect2.out_bam, output_folder="VCF")
        self.output("mutect2_norm", source=self.mutect2.out, output_folder="VCF")
        # NOTE(review): unlike the outputs above, this one sets no output_folder;
        # left unchanged so the output location stays the same - confirm intent.
        self.output("addbamstats_vcf", source=self.addbamstats.out)
        # what more output to save?
        # self.output("final_vcf", source=self.filter_variants_2.out)
        # self.output("tsv", source=self.convert_to_tsv.out)

janis_bioinformatics/tools/pmac/performancesummary/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def inputs(self):
4646
),
4747
ToolInput(
4848
"outputPrefix",
49-
Filename(extension=".csv"),
49+
Filename(),
5050
prefix="-o",
5151
doc="prefix of output summary csv",
5252
),

janis_bioinformatics/tools/variantcallers/gatk/gatkgermline_variants_4_1_3.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def constructor(self):
6363
pairHmmImplementation="LOGLESS_CACHING",
6464
),
6565
)
66-
self.step("uncompressvcf", UncompressArchive(file=self.haplotype_caller.out))
66+
self.step("uncompressvcf", UncompressArchive(file=self.haplotype_caller.out, force=True))
6767
self.step(
6868
"splitnormalisevcf",
6969
SplitMultiAllele(

janis_bioinformatics/tools/variantcallers/gatk/gatksomatic_variants_4_1_3.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,13 @@ def constructor(self):
9898
)
9999

100100
# normalise and filter "PASS" variants
101-
self.step("uncompressvcf", UncompressArchive(file=self.filtermutect2calls.out))
101+
self.step(
102+
"uncompressvcf",
103+
UncompressArchive(
104+
file=self.filtermutect2calls.out,
105+
force=True
106+
)
107+
)
102108
self.step(
103109
"splitnormalisevcf",
104110
SplitMultiAllele(

0 commit comments

Comments
 (0)