Skip to content

Commit 3d4e9a1

Browse files
committed
Don't use original VCF header in vcz
1 parent cfa5e4e commit 3d4e9a1

File tree

1 file changed

+6
-43
lines changed

1 file changed

+6
-43
lines changed

vcztools/vcf_writer.py

Lines changed: 6 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
import io
22
import logging
3-
import re
43
import sys
54
from datetime import datetime
65
from typing import Optional
@@ -145,11 +144,9 @@ def write_vcf(
145144
)
146145

147146
if not no_header:
148-
original_header = root.attrs.get("vcf_header", None)
149147
force_ac_an_header = not drop_genotypes and samples_selection is not None
150148
vcf_header = _generate_header(
151149
root,
152-
original_header,
153150
sample_ids,
154151
no_version=no_version,
155152
force_ac_an=force_ac_an_header,
@@ -300,7 +297,6 @@ def c_chunk_to_vcf(
300297

301298
def _generate_header(
302299
ds,
303-
original_header,
304300
sample_ids,
305301
*,
306302
no_version: bool = False,
@@ -340,45 +336,12 @@ def _generate_header(
340336
if key in ("genotype", "genotype_phased"):
341337
continue
342338
format_fields.append(key)
343-
if original_header is None: # generate entire header
344-
# [1.4.1 File format]
345-
print("##fileformat=VCFv4.3", file=output)
346-
347-
if "source" in ds.attrs:
348-
print(f'##source={ds.attrs["source"]}', file=output)
349-
350-
else: # use original header fields where appropriate
351-
unstructured_pattern = re.compile("##([^=]+)=([^<].*)")
352-
structured_pattern = re.compile("##([^=]+)=(<.*)")
353-
354-
for line in original_header.split("\n"):
355-
if re.fullmatch(unstructured_pattern, line):
356-
print(line, file=output)
357-
else:
358-
match = re.fullmatch(structured_pattern, line)
359-
if match:
360-
category = match.group(1)
361-
id_pattern = re.compile("ID=([^,>]+)")
362-
key = id_pattern.findall(line)[0]
363-
if category not in ("contig", "FILTER", "INFO", "FORMAT"):
364-
# output other structured fields
365-
print(line, file=output)
366-
# only output certain categories if in dataset
367-
elif category == "contig" and key in contigs:
368-
contigs.remove(key)
369-
print(line, file=output)
370-
elif category == "FILTER" and key in filters:
371-
filters.remove(key)
372-
print(line, file=output)
373-
elif category == "INFO" and key in info_fields:
374-
info_fields.remove(key)
375-
print(line, file=output)
376-
elif category == "FORMAT" and key in format_fields:
377-
format_fields.remove(key)
378-
print(line, file=output)
379-
380-
# add all fields that are not in the original header
381-
# or all fields if there was no original header
339+
340+
# [1.4.1 File format]
341+
print("##fileformat=VCFv4.3", file=output)
342+
343+
if "source" in ds.attrs:
344+
print(f'##source={ds.attrs["source"]}', file=output)
382345

383346
# [1.4.2 Information field format]
384347
for key in info_fields:

0 commit comments

Comments
 (0)