|
1 | 1 | import io
|
2 | 2 | import logging
|
3 |
| -import re |
4 | 3 | import sys
|
5 | 4 | from datetime import datetime
|
6 | 5 | from typing import Optional
|
@@ -183,10 +182,8 @@ def write_vcf(
|
183 | 182 | reader = retrieval.VariantChunkReader(root)
|
184 | 183 |
|
185 | 184 | if not no_header:
|
186 |
| - original_header = root.attrs.get("vcf_header", None) |
187 | 185 | vcf_header = _generate_header(
|
188 | 186 | root,
|
189 |
| - original_header, |
190 | 187 | sample_ids,
|
191 | 188 | no_version=no_version,
|
192 | 189 | force_ac_an=force_ac_an_header,
|
@@ -429,7 +426,6 @@ def c_chunk_to_vcf(
|
429 | 426 |
|
430 | 427 | def _generate_header(
|
431 | 428 | ds,
|
432 |
| - original_header, |
433 | 429 | sample_ids,
|
434 | 430 | *,
|
435 | 431 | no_version: bool = False,
|
@@ -469,45 +465,12 @@ def _generate_header(
|
469 | 465 | if key in ("genotype", "genotype_phased"):
|
470 | 466 | continue
|
471 | 467 | format_fields.append(key)
|
472 |
| - if original_header is None: # generate entire header |
473 |
| - # [1.4.1 File format] |
474 |
| - print("##fileformat=VCFv4.3", file=output) |
475 | 468 |
|
476 |
| - if "source" in ds.attrs: |
477 |
| - print(f'##source={ds.attrs["source"]}', file=output) |
| 469 | + # [1.4.1 File format] |
| 470 | + print("##fileformat=VCFv4.3", file=output) |
478 | 471 |
|
479 |
| - else: # use original header fields where appropriate |
480 |
| - unstructured_pattern = re.compile("##([^=]+)=([^<].*)") |
481 |
| - structured_pattern = re.compile("##([^=]+)=(<.*)") |
482 |
| - |
483 |
| - for line in original_header.split("\n"): |
484 |
| - if re.fullmatch(unstructured_pattern, line): |
485 |
| - print(line, file=output) |
486 |
| - else: |
487 |
| - match = re.fullmatch(structured_pattern, line) |
488 |
| - if match: |
489 |
| - category = match.group(1) |
490 |
| - id_pattern = re.compile("ID=([^,>]+)") |
491 |
| - key = id_pattern.findall(line)[0] |
492 |
| - if category not in ("contig", "FILTER", "INFO", "FORMAT"): |
493 |
| - # output other structured fields |
494 |
| - print(line, file=output) |
495 |
| - # only output certain categories if in dataset |
496 |
| - elif category == "contig" and key in contigs: |
497 |
| - contigs.remove(key) |
498 |
| - print(line, file=output) |
499 |
| - elif category == "FILTER" and key in filters: |
500 |
| - filters.remove(key) |
501 |
| - print(line, file=output) |
502 |
| - elif category == "INFO" and key in info_fields: |
503 |
| - info_fields.remove(key) |
504 |
| - print(line, file=output) |
505 |
| - elif category == "FORMAT" and key in format_fields: |
506 |
| - format_fields.remove(key) |
507 |
| - print(line, file=output) |
508 |
| - |
509 |
| - # add all fields that are not in the original header |
510 |
| - # or all fields if there was no original header |
| 472 | + if "source" in ds.attrs: |
| 473 | + print(f'##source={ds.attrs["source"]}', file=output) |
511 | 474 |
|
512 | 475 | # [1.4.2 Information field format]
|
513 | 476 | for key in info_fields:
|
|
0 commit comments