Skip to content

Commit 0ae4c58

Browse files
authored
Merge pull request #118 from martinghunt/gvcf_add_depths
Add tag DP_ACGT to GVCF
2 parents dc15413 + f8961ba commit 0ae4c58

File tree

5 files changed

+112
-44
lines changed

5 files changed

+112
-44
lines changed

python/clockwork/gvcf.py

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1+
from collections import Counter
12
import datetime
3+
import logging
4+
import subprocess
25

36
from cluster_vcf_records import vcf_file_read, vcf_record
47
import pyfastaq
@@ -15,6 +18,7 @@ def _combine_minos_and_samtools_header(minos_header, samtools_header, ref_seqs):
1518
new_header = [
1619
'##FILTER=<ID=NO_DATA,Description="No information from minos or samtools">',
1720
'##INFO=<ID=CALLER,Number=1,Description="Origin of call, one of minos, samtools, or none if there was no depth">',
21+
'##FORMAT=<ID=DP_ACGT,Number=8,Type=Integer,Description="Number of A-forward, A-reverse, C-forward, C-reverse, G-forward, G-reverse, T-forward, T-reverse bases">',
1822
]
1923
new_header.extend(
2024
[f"##contig=<ID={k},length={len(v)}>" for k, v in ref_seqs.items()]
@@ -126,16 +130,61 @@ def _finish_contig(ref_pos, ref_seq, minos_record, minos_iter, filehandle):
126130
ref_pos += 1
127131

128132

129-
def gvcf_from_minos_vcf_and_samtools_gvcf(ref_fasta, minos_vcf, samtools_vcf, out_vcf):
133+
def _samtools_acgt_depths(bam_file):
134+
command = f"samtools mpileup --no-output-ins --no-output-ins --no-output-del --no-output-del --no-output-ends -aa {bam_file}"
135+
logging.info(f"Gathering per-position A/C/G/T depths ({command})")
136+
p = subprocess.Popen(
137+
command, shell=True, stdout=subprocess.PIPE, universal_newlines=True
138+
)
139+
acgt_depths = {}
140+
acgt = ("A", "a", "C", "c", "G", "g", "T", "t")
141+
142+
for line in p.stdout:
143+
ref, pos, _, _, depths, _ = line.split("\t")
144+
pos = int(pos) - 1
145+
if ref not in acgt_depths:
146+
acgt_depths[ref] = {}
147+
assert pos not in acgt_depths[ref]
148+
counts = Counter(depths)
149+
acgt_depths[ref][pos] = ",".join(str(counts[x]) for x in acgt)
150+
151+
p.wait()
152+
if p.returncode != 0:
153+
raise Exception(f"Error running samtools mpileup: {command}")
154+
155+
logging.info(f"Finished per-position A/C/G/T depths ({command})")
156+
return acgt_depths
157+
158+
159+
def _add_acgt_depths_to_minos(minos_records, acgt_depths):
160+
for ref in minos_records:
161+
if ref not in acgt_depths:
162+
continue
163+
depths = acgt_depths[ref]
164+
165+
for record in minos_records[ref]:
166+
if record.POS in depths:
167+
record.set_format_key_value("DP_ACGT", depths[record.POS])
168+
169+
170+
def gvcf_from_minos_vcf_and_samtools_gvcf(
171+
ref_fasta, minos_vcf, samtools_vcf, out_vcf, bam_file=None
172+
):
130173
minos_header, minos_records = vcf_file_read.vcf_file_to_dict(minos_vcf)
131174
samtools_header = vcf_file_read.get_header_lines_from_vcf_file(samtools_vcf)
175+
if bam_file is not None:
176+
acgt_depths = _samtools_acgt_depths(bam_file)
177+
_add_acgt_depths_to_minos(minos_records, acgt_depths)
178+
else:
179+
acgt_depths = {}
132180

133181
ref_seqs = {}
134182
pyfastaq.tasks.file_to_dict(ref_fasta, ref_seqs)
135183
used_ref_seqs = set()
136184
ref_seq = None
137185
ref_pos = -1
138186
minos_record = None
187+
logging.info(f"Making GVCF file {out_vcf} from {minos_vcf} {samtools_vcf}")
139188

140189
with open(out_vcf, "w") as f_out, open(samtools_vcf) as f_samtools:
141190
print(
@@ -161,6 +210,12 @@ def gvcf_from_minos_vcf_and_samtools_gvcf(ref_fasta, minos_vcf, samtools_vcf, ou
161210
continue
162211

163212
samtools_record = vcf_record.VcfRecord(line)
213+
try:
214+
samtools_record.set_format_key_value(
215+
"DP_ACGT", acgt_depths[samtools_record.CHROM][samtools_record.POS]
216+
)
217+
except KeyError:
218+
pass
164219

165220
# If we've found a new CHROM in the samtools VCF file
166221
if ref_seq is None or ref_seq.id != samtools_record.CHROM:
@@ -203,7 +258,7 @@ def gvcf_from_minos_vcf_and_samtools_gvcf(ref_fasta, minos_vcf, samtools_vcf, ou
203258
print(samtools_record, file=f_out)
204259
ref_pos = samtools_record.POS + 1
205260

206-
if ref_seq is not None: # happens if the samtools VCF was empty
261+
if ref_seq is not None: # happens if the samtools VCF was empty
207262
_finish_contig(ref_pos, ref_seq, minos_record, minos_iter, f_out)
208263
_print_non_samtools_seqs(ref_seqs, used_ref_seqs, minos_records, f_out)
209264

python/clockwork/tasks/gvcf_from_minos_and_samtools.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,9 @@
33

44
def run(options):
55
gvcf.gvcf_from_minos_vcf_and_samtools_gvcf(
6-
options.ref_fasta, options.minos_vcf, options.samtools_vcf, options.outfile,
6+
options.ref_fasta,
7+
options.minos_vcf,
8+
options.samtools_vcf,
9+
options.outfile,
10+
bam_file=options.bam_file,
711
)

python/clockwork/tests/data/gvcf/gvcf_from_minos_vcf_and_samtools_gvcf.out.vcf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="total read depth from gramtools">
1717
##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases">
1818
##FORMAT=<ID=DPF,Number=1,Type=Float,Description="Depth Fraction, defined as DP divided by mean depth">
19+
##FORMAT=<ID=DP_ACGT,Number=8,Type=Integer,Description="Number of A-forward, A-reverse, C-forward, C-reverse, G-forward, G-reverse, T-forward, T-reverse bases">
1920
##FORMAT=<ID=FQ,Number=1,Type=Float,Description="Phred probability of all samples being the same">
2021
##FORMAT=<ID=FRS,Number=1,Type=Float,Description="Fraction of reads that support the genotype call">
2122
##FORMAT=<ID=G3,Number=3,Type=Float,Description="ML estimate of genotype frequencies">

python/clockwork/var_call_one_sample_pipeline.py

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,13 @@ def run(
3434
"Must give same number of forward and reverse reads files. Got:\nForward:{reads1_list}\nReverse:{reads2_list}"
3535
)
3636
if not check_reads_files(reads1_list + reads2_list):
37-
raise Exception("Reads file(s) not found or are empty. See previous error messages. Cannot continue")
37+
raise Exception(
38+
"Reads file(s) not found or are empty. See previous error messages. Cannot continue"
39+
)
3840
if not os.path.exists(ref_dir):
39-
raise FileNotFoundError(f"Reference directory not found: {ref_dir}. Cannot continue")
41+
raise FileNotFoundError(
42+
f"Reference directory not found: {ref_dir}. Cannot continue"
43+
)
4044

4145
os.mkdir(outdir)
4246

@@ -51,10 +55,7 @@ def run(
5155
trimmed_reads_1.append(os.path.join(outdir, f"trimmed_reads.{i}.1.fq.gz"))
5256
trimmed_reads_2.append(os.path.join(outdir, f"trimmed_reads.{i}.2.fq.gz"))
5357
read_trim.run_trimmomatic(
54-
reads1_list[i],
55-
reads2_list[i],
56-
trimmed_reads_1[-1],
57-
trimmed_reads_2[-1],
58+
reads1_list[i], reads2_list[i], trimmed_reads_1[-1], trimmed_reads_2[-1]
5859
)
5960

6061
refdir = reference_dir.ReferenceDir(directory=ref_dir)
@@ -77,7 +78,9 @@ def run(
7778
utils.syscall(cmd)
7879
samtools_vcf_has_vars = utils.vcf_has_records(samtools_vcf)
7980
if not samtools_vcf_has_vars:
80-
logging.warn("SAMTOOLS VCF FILE HAS ZERO VARIANTS. PLEASE CHECK THE QUALITY OF INPUT DATA")
81+
logging.warn(
82+
"SAMTOOLS VCF FILE HAS ZERO VARIANTS. PLEASE CHECK THE QUALITY OF INPUT DATA"
83+
)
8184

8285
samtools_gvcf = os.path.join(outdir, "samtools.gvcf")
8386
cmd = f"bcftools mpileup -I --output-type u -f {refdir.ref_fasta} {rmdup_bam} | bcftools call -c -O v -o {samtools_gvcf}"
@@ -94,7 +97,9 @@ def run(
9497
try:
9598
ctx.run(run_mccortex_view_kmers=False)
9699
except:
97-
logging.error("ERROR RUNNING CORTEX. WILL TRY TO CONTINUE USING SAMTOOLS VCF ONLY (IF SAMTOOLS FOUND VARIANTS), BUT PLEASE CHECK CORTEX LOGS")
100+
logging.error(
101+
"ERROR RUNNING CORTEX. WILL TRY TO CONTINUE USING SAMTOOLS VCF ONLY (IF SAMTOOLS FOUND VARIANTS), BUT PLEASE CHECK CORTEX LOGS"
102+
)
98103

99104
ctx_vcf_dir = os.path.join(cortex_dir, "cortex.out", "vcfs")
100105
try:
@@ -108,24 +113,43 @@ def run(
108113

109114
cortex_vcf_has_vars = False
110115
if len(cortex_vcfs) != 1:
111-
logging.error("NO VCF FILE MADE BY CORTEX. WILL TRY TO CONTINUE USING SAMTOOLS VCF ONLY (IF SAMTOOLS FOUND VARIANTS), BUT PLEASE CHECK THE QUALITY OF INPUT DATA, AND CORTEX LOGS")
116+
logging.error(
117+
"NO VCF FILE MADE BY CORTEX. WILL TRY TO CONTINUE USING SAMTOOLS VCF ONLY (IF SAMTOOLS FOUND VARIANTS), BUT PLEASE CHECK THE QUALITY OF INPUT DATA, AND CORTEX LOGS"
118+
)
112119
cortex_vcf = ""
113120
else:
114121
cortex_vcf = os.path.join(outdir, "cortex.vcf")
115122
os.rename(cortex_vcfs[0], cortex_vcf)
116123
cortex_vcf_has_vars = utils.vcf_has_records(cortex_vcf)
117124
if not cortex_vcf_has_vars:
118-
logging.warn("CORTEX VCF FILE HAS ZERO VARIANTS. PLEASE CHECK THE QUALITY OF INPUT DATA")
125+
logging.warn(
126+
"CORTEX VCF FILE HAS ZERO VARIANTS. PLEASE CHECK THE QUALITY OF INPUT DATA"
127+
)
119128
if not debug:
120129
utils.syscall(f"rm -rf {cortex_dir}")
121130

122131
final_vcf = os.path.join(outdir, "final.vcf")
123132

124133
if not (samtools_vcf_has_vars and cortex_vcf_has_vars):
125-
logging.error("NO VARIANTS FOUND BY CORTEX OR SAMTOOLS. WRITING HEADER-ONLY FINAL VCF FILE INSTEAD OF RUNNING MINOS")
134+
logging.error(
135+
"NO VARIANTS FOUND BY CORTEX OR SAMTOOLS. WRITING HEADER-ONLY FINAL VCF FILE INSTEAD OF RUNNING MINOS"
136+
)
126137
with open(final_vcf, "w") as f:
127138
print("##fileformat=VCFv4.2", file=f)
128-
print("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "sample", sep="\t", file=f)
139+
print(
140+
"#CHROM",
141+
"POS",
142+
"ID",
143+
"REF",
144+
"ALT",
145+
"QUAL",
146+
"FILTER",
147+
"INFO",
148+
"FORMAT",
149+
"sample",
150+
sep="\t",
151+
file=f,
152+
)
129153
minos_vcf_has_vars = False
130154
else:
131155
minos_dir = os.path.join(outdir, "minos")
@@ -137,7 +161,7 @@ def run(
137161

138162
final_gvcf = os.path.join(outdir, "final.gvcf")
139163
gvcf.gvcf_from_minos_vcf_and_samtools_gvcf(
140-
refdir.ref_fasta, final_vcf, samtools_gvcf, final_gvcf,
164+
refdir.ref_fasta, final_vcf, samtools_gvcf, final_gvcf, bam_file=rmdup_bam
141165
)
142166
if not debug:
143167
os.unlink(samtools_gvcf)

python/scripts/clockwork

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -477,20 +477,17 @@ subparser_gvcf_from_minos_and_samtools = subparsers.add_parser(
477477
)
478478

479479
subparser_gvcf_from_minos_and_samtools.add_argument(
480-
"ref_fasta",
481-
help="Reference genome FASTA file",
480+
"ref_fasta", help="Reference genome FASTA file"
482481
)
483482
subparser_gvcf_from_minos_and_samtools.add_argument(
484-
"minos_vcf",
485-
help="VCF file made by minos",
483+
"minos_vcf", help="VCF file made by minos"
486484
)
487485
subparser_gvcf_from_minos_and_samtools.add_argument(
488-
"samtools_vcf",
489-
help="VCF file made by samtools",
486+
"samtools_vcf", help="VCF file made by samtools"
490487
)
488+
subparser_gvcf_from_minos_and_samtools.add_argument("outfile", help="Output gVCF file")
491489
subparser_gvcf_from_minos_and_samtools.add_argument(
492-
"outfile",
493-
help="Output gVCF file",
490+
"--bam_file", help="BAM file. If used, will add DP_ACGT to VCF"
494491
)
495492
subparser_gvcf_from_minos_and_samtools.set_defaults(
496493
func=clockwork.tasks.gvcf_from_minos_and_samtools.run
@@ -506,13 +503,9 @@ subparser_gvcf_to_fasta = subparsers.add_parser(
506503
)
507504

508505
subparser_gvcf_to_fasta.add_argument(
509-
"gvcf",
510-
help="Input gVCF file made by gvcf_from_minos_and_samtools",
511-
)
512-
subparser_gvcf_to_fasta.add_argument(
513-
"outfile",
514-
help="Output FASTA file",
506+
"gvcf", help="Input gVCF file made by gvcf_from_minos_and_samtools"
515507
)
508+
subparser_gvcf_to_fasta.add_argument("outfile", help="Output FASTA file")
516509
subparser_gvcf_to_fasta.add_argument(
517510
"--ignore_minos_pass",
518511
action="store_true",
@@ -840,9 +833,7 @@ subparser_samtools_qc = subparsers.add_parser(
840833
description="Runs BWA MEM, picard MarkDuplicates, then samtools stats and plots",
841834
)
842835

843-
subparser_samtools_qc.add_argument(
844-
"ref_fasta", help="Name of reference fasta file"
845-
)
836+
subparser_samtools_qc.add_argument("ref_fasta", help="Name of reference fasta file")
846837
subparser_samtools_qc.add_argument("reads1", help="Name of reads file 1")
847838
subparser_samtools_qc.add_argument("reads2", help="Name of reads file 2")
848839
subparser_samtools_qc.add_argument(
@@ -967,19 +958,13 @@ subparser_variant_call_one_sample.add_argument(
967958
metavar="INT",
968959
)
969960
subparser_variant_call_one_sample.add_argument(
970-
"--force",
971-
action="store_true",
972-
help="Overwrite outdir if it already exists",
961+
"--force", action="store_true", help="Overwrite outdir if it already exists"
973962
)
974963
subparser_variant_call_one_sample.add_argument(
975-
"--keep_bam",
976-
action="store_true",
977-
help="Keep BAM file of rmdup reads",
964+
"--keep_bam", action="store_true", help="Keep BAM file of rmdup reads"
978965
)
979966
subparser_variant_call_one_sample.add_argument(
980-
"--debug",
981-
action="store_true",
982-
help="Debug mode: do not clean up any files",
967+
"--debug", action="store_true", help="Debug mode: do not clean up any files"
983968
)
984969
subparser_variant_call_one_sample.add_argument(
985970
"--no_trim",
@@ -990,8 +975,7 @@ subparser_variant_call_one_sample.add_argument(
990975
"ref_dir", help="Directory of reference files, made by clockwork reference_prepare"
991976
)
992977
subparser_variant_call_one_sample.add_argument(
993-
"outdir",
994-
help="Output directory (must not exist, will be created)",
978+
"outdir", help="Output directory (must not exist, will be created)"
995979
)
996980
subparser_variant_call_one_sample.add_argument(
997981
"reads_files",

0 commit comments

Comments
 (0)