Skip to content

Commit b65e7fa

Browse files
committed
🚧 Use tsv-utils for --output-metadata
tsv-join is much faster than the previous Python implementation based on csv.DictWriter (about 18x faster: 12s vs. 3m43s on the current SARS-CoV-2 GISAID dataset containing 16 million rows).
1 parent adfff1e commit b65e7fa

File tree

3 files changed

+33
-26
lines changed

3 files changed

+33
-26
lines changed

augur/filter/_run.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -419,14 +419,13 @@ def run(args):
419419
write_vcf(args.sequences, args.output_sequences, dropped_samps)
420420
else:
421421
subset_fasta(args.sequences, args.output_sequences, strains_file, args.nthreads)
422-
if not args.output_strains:
423-
os.remove(strains_file)
424422

425423
if args.output_metadata:
426424
print_debug(f"Reading metadata from {args.metadata!r} and writing to {args.output_metadata!r}…")
427-
write_output_metadata(args.metadata, args.metadata_delimiters,
428-
args.metadata_id_columns, args.output_metadata,
429-
valid_strains)
425+
write_output_metadata(args.metadata, metadata_object.id_column, args.output_metadata, strains_file)
426+
427+
if not args.output_strains:
428+
os.remove(strains_file)
430429

431430
# Calculate the number of strains that don't exist in either metadata or
432431
# sequences.

augur/filter/io.py

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,20 @@
11
import argparse
2-
import csv
32
from argparse import Namespace
43
import os
54
import re
5+
from shlex import quote as shquote
6+
from shutil import which
67
from textwrap import dedent
7-
from typing import Sequence, Set
8+
from typing import Sequence
89
import numpy as np
910
from collections import defaultdict
10-
from xopen import xopen
1111

1212
from augur.errors import AugurError
1313
from augur.io.file import open_file
14-
from augur.io.metadata import Metadata, METADATA_DATE_COLUMN
14+
from augur.io.metadata import METADATA_DATE_COLUMN
1515
from augur.io.print import print_err
16+
from augur.io.shell_command_runner import run_shell_command
17+
from augur.utils import augur
1618
from .constants import GROUP_BY_GENERATED_COLUMNS
1719
from .include_exclude_rules import extract_variables, parse_filter_query
1820

@@ -96,25 +98,29 @@ def constant_factory(value):
9698
raise AugurError(f"missing or malformed priority scores file {fname}")
9799

98100

99-
def write_output_metadata(input_metadata_path: str, delimiters: Sequence[str],
100-
id_columns: Sequence[str], output_metadata_path: str,
101-
ids_to_write: Set[str]):
101+
def write_output_metadata(input_filename: str, id_column: str, output_filename: str, ids_file: str):
102102
"""
103-
Write output metadata file given input metadata information and a set of IDs
104-
to write.
103+
Write output metadata file given input metadata information and a file
104+
containing ids to write.
105105
"""
106-
input_metadata = Metadata(input_metadata_path, delimiters, id_columns)
107-
108-
with xopen(output_metadata_path, "w", newline="") as output_metadata_handle:
109-
output_metadata = csv.DictWriter(output_metadata_handle, fieldnames=input_metadata.columns,
110-
delimiter="\t", lineterminator=os.linesep)
111-
output_metadata.writeheader()
112-
113-
# Write outputs based on rows in the original metadata.
114-
for row in input_metadata.rows():
115-
row_id = row[input_metadata.id_column]
116-
if row_id in ids_to_write:
117-
output_metadata.writerow(row)
106+
# FIXME: make this a function like augur() and seqkit()
107+
tsv_join = which("tsv-join")
108+
109+
command = f"""
110+
{augur()} read-file {shquote(input_filename)} |
111+
{tsv_join} -H --filter-file {ids_file} --key-fields {id_column} |
112+
{augur()} write-file {shquote(output_filename)}
113+
"""
114+
115+
try:
116+
run_shell_command(command, raise_errors=True)
117+
except Exception:
118+
if os.path.isfile(output_filename):
119+
# Remove the partial output file.
120+
os.remove(output_filename)
121+
raise AugurError(f"Metadata output failed, see error(s) above.")
122+
else:
123+
raise AugurError(f"Metadata output failed, see error(s) above. The command may have already written data to stdout. You may want to clean up any partial outputs.")
118124

119125

120126
# These are the types accepted in the following function.

tests/functional/filter/cram/filter-output-metadata-header.t

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ the default quotechar, any column names with that character may be altered.
77

88
Quoted columns containing the tab delimiter are left unchanged.
99

10+
# FIXME: tsv-join has different behavior here. Test both?
11+
1012
$ cat >metadata.tsv <<~~
1113
> strain "col 1"
1214
> SEQ_1 a

0 commit comments

Comments
 (0)