Commit fcda4b1

WIP strain name matching

1 parent c34c448 commit fcda4b1

File tree

6 files changed: +430, -0 lines changed
Lines changed: 35 additions & 0 deletions

# Strain name matching

With our move away from fauna to a curated all-influenza ingest pipeline we have introduced a number of strain name changes. This immediately presents a problem, as we have myriad hardcoded lists of strain names, such as outliers-to-drop lists and force-include lists. This workflow (which is rather ad hoc!) attempts to match up the old (i.e. fauna) strain names with their updated strain names.

We use a combination of fuzzy matching and, where possible, a hardcoded map of fauna strain names to new strain names.
## How to run

```
cd ingest
snakemake --cores 4 --snakefile build-configs/strain-name-matching/Snakefile -pf -n
```
## What files you'll need

* The original fauna metadata files, e.g. (from the base directory) `aws s3 cp s3://nextstrain-data-private/files/workflows/seasonal-flu/h1n1pdm/metadata.tsv.xz - | xz -c -d > data/h1n1pdm/metadata.tsv`. Note that this file won't represent fauna data forever!

* A number of files from the normal ingest pipeline: `snakemake --cores 2 --config gisaid_pairs='["gisaid_cache"]' -pf data/curated_gisaid.ndjson data/avian-flu/curated_gisaid.ndjson results/h3n2/metadata.tsv results/h1n1pdm/metadata.tsv results/vic/metadata.tsv results/yam/metadata.tsv`
## Hardcoded strain maps

For seasonal-flu datasets we can query the EPI_ISLs of the fauna data against the curated data to create lookups.
This table reports how many of the fauna strain names have matches in our data. For avian-flu it's a little trickier, so we leverage the existing diff-avian-flu script to create lookups.

| Dataset   | Updated | Missing | Unchanged |
| --------- | ------- | ------- | --------- |
| H1N1pdm   | 1,729   | 1,659   | 149,143   |
| H3N2      | 2,125   | 2,674   | 177,009   |
| vic       | 1,338   | 286     | 66,283    |
| yam       | 367     | 10      | 21,946    |
| avian-flu | 34,426  | 1,531   | 26,754    |
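The counts above come from joining fauna and curated records on EPI_ISL and classifying each fauna strain. As a toy sketch of that classification (made-up records; the real work is done by `match-strain-names.py` and `diff-avian-flu.py` below):

```python
def classify(fauna: dict[str, str], curated: dict[str, str]) -> dict[str, int]:
    """Join fauna and curated {EPI_ISL: strain} maps and tally name changes."""
    counts = {"updated": 0, "missing": 0, "unchanged": 0}
    for epi_isl, fauna_name in fauna.items():
        new_name = curated.get(epi_isl)
        if new_name is None:
            counts["missing"] += 1      # EPI_ISL absent from curated data
        elif new_name == fauna_name:
            counts["unchanged"] += 1    # same strain name in both
        else:
            counts["updated"] += 1      # name changed; goes into the strain map
    return counts

fauna = {"EPI_ISL_1": "A/Lyon/123/2020", "EPI_ISL_2": "A/Texas/4/2021", "EPI_ISL_3": "A/Gone/1/2019"}
curated = {"EPI_ISL_1": "A/Lyon/123/2020", "EPI_ISL_2": "A/Texas/04/2021"}
print(classify(fauna, curated))  # {'updated': 1, 'missing': 1, 'unchanged': 1}
```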
Lines changed: 83 additions & 0 deletions
include: "../../rules/remote_files.smk"

import os
configfile: os.path.join(workflow.basedir, "config.yaml")

print(f"{config=}")

rule all:
    input:
        files = [f"results/strain-name-matching/{d}/{f}" for d in config['strain-lists'] for f in config['strain-lists'][d]]


rule metadata_strains:
    input:
        "results/{dataset}/metadata.tsv",
    output:
        "results/strain-name-matching/{dataset}/metadata.txt"
    run:
        from augur.io import read_metadata
        m = read_metadata(input[0])
        with open(output[0], 'w') as fh:
            print("\n".join(m.index.tolist()), file=fh)

rule compute_strain_maps_via_epi_isl_lookup:
    """
    Computes maps of (old) fauna strain names to (new) curated strain names, where possible.
    Fauna inputs via (e.g.) "aws s3 cp s3://nextstrain-data-private/files/workflows/seasonal-flu/vic/metadata.tsv.xz - | xz -c -d > data/vic/metadata.tsv"
    """
    input:
        fauna=lambda w: config['strain-maps'][w.dataset],
        curated="data/curated_gisaid.ndjson", # hardcoded
    output:
        tsv = "results/strain-name-matching/{dataset}/strain-name-map.tsv"
    wildcard_constraints:
        dataset = "h3n2|h1n1pdm|vic|yam"
    shell:
        """
        ./scripts/match-strain-names.py \
            --original-metadata {input.fauna} \
            --new-metadata {input.curated} \
            --changed {output.tsv}
        """

rule compute_strain_maps_via_avian_flu_diff:
    """
    Computes maps of (old) fauna strain names to (new) curated strain names, where possible.
    """
    input:
        fauna=lambda w: config['strain-maps'][w.dataset],
        curated="data/avian-flu/curated_gisaid.ndjson", # hardcoded
    output:
        tsv = "results/strain-name-matching/{dataset}/strain-name-map.tsv"
    wildcard_constraints:
        dataset = "avian-flu"
    shell:
        """
        ./scripts/diff-avian-flu.py \
            --truth {input.fauna} \
            --query {input.curated} \
            --output-strain-map {output.tsv}
        """

def strain_map(wildcards):
    if config['strain-maps'].get(wildcards.dataset, False):
        # Fill in the dataset explicitly rather than relying on wildcard expansion
        return f"results/strain-name-matching/{wildcards.dataset}/strain-name-map.tsv"
    return []

rule match_strains:
    input:
        metadata="results/strain-name-matching/{dataset}/metadata.txt",
        strains=lambda w: path_or_url(config['strain-lists'][w.dataset][w.file_type]),
        strain_map_tsv=strain_map,
    output:
        strains="results/strain-name-matching/{dataset}/{file_type}",
    shell:
        """
        ./scripts/strain-name-fuzzer.py \
            --curated-strains {input.metadata:q} \
            --query-strains {input.strains:q} \
            --strain-map {input.strain_map_tsv:q} \
            > {output.strains:q}
        """
Lines changed: 42 additions & 0 deletions
strain-maps:
  # The value here points to the FAUNA data
  h1n1pdm: "../data/{dataset}/metadata.tsv"
  h3n2: "../data/{dataset}/metadata.tsv"
  vic: "../data/{dataset}/metadata.tsv"
  yam: "../data/{dataset}/metadata.tsv"
  avian-flu: "targets/avian-flu-fauna-ha.tsv"


strain-lists:
  h3n2:
    "outliers.txt": "../config/h3n2/outliers.txt"
    "reference_strains.txt": "../config/h3n2/reference_strains.txt"
    "ha_prioritised_seqs_file.tsv": "../config/h3n2/ha/prioritized_seqs_file.tsv"

  vic:
    "outliers.txt": "../config/vic/outliers.txt"
    "reference_strains.txt": "../config/vic/reference_strains.txt"

  yam:
    "outliers.txt": "../config/yam/outliers.txt"
    "reference_strains.txt": "../config/yam/reference_strains.txt"

  h1n1pdm:
    "outliers.txt": "../config/h1n1pdm/outliers.txt"
    "reference_strains.txt": "../config/h1n1pdm/reference_strains.txt"

  avian-flu:
    h5n1_dropped_strains_h5n1.txt: https://raw.githubusercontent.com/nextstrain/avian-flu/refs/heads/master/config/h5n1/dropped_strains_h5n1.txt
    h5n1_include_strains_h5n1_2y.txt: https://raw.githubusercontent.com/nextstrain/avian-flu/refs/heads/master/config/h5n1/include_strains_h5n1_2y.txt
    h5n1_include_strains_h5n1_all-time.txt: https://raw.githubusercontent.com/nextstrain/avian-flu/refs/heads/master/config/h5n1/include_strains_h5n1_all-time.txt

    h5nx_dropped_strains_h5nx.txt: https://raw.githubusercontent.com/nextstrain/avian-flu/refs/heads/master/config/h5nx/dropped_strains_h5nx.txt
    h5nx_include_strains_h5nx_2y.txt: https://raw.githubusercontent.com/nextstrain/avian-flu/refs/heads/master/config/h5nx/include_strains_h5nx_2y.txt
    h5nx_include_strains_h5nx_all-time.txt: https://raw.githubusercontent.com/nextstrain/avian-flu/refs/heads/master/config/h5nx/include_strains_h5nx_all-time.txt

    h7n9_dropped_strains_h7n9.txt: https://raw.githubusercontent.com/nextstrain/avian-flu/refs/heads/master/config/h7n9/dropped_strains_h7n9.txt
    h7n9_include_strains_h7n9_all-time.txt: https://raw.githubusercontent.com/nextstrain/avian-flu/refs/heads/master/config/h7n9/include_strains_h7n9_all-time.txt

    h9n2_dropped_strains_h9n2.txt: https://raw.githubusercontent.com/nextstrain/avian-flu/refs/heads/master/config/h9n2/dropped_strains_h9n2.txt
    h9n2_include_strains_h9n2_all-time.txt: https://raw.githubusercontent.com/nextstrain/avian-flu/refs/heads/master/config/h9n2/include_strains_h9n2_all-time.txt
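`path_or_url` (included from `remote_files.smk`, which isn't part of this commit) is what allows the avian-flu entries above to be URLs while the seasonal-flu entries are local paths. A minimal sketch of the distinction it has to draw (hypothetical — the real helper presumably also arranges for remote files to be fetched):

```python
def is_remote(value: str) -> bool:
    """True for http(s) URLs, False for local paths."""
    return value.startswith(("http://", "https://"))

print(is_remote("../config/h3n2/outliers.txt"))  # False: a plain relative path
```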

ingest/scripts/diff-avian-flu.py

Lines changed: 11 additions & 0 deletions

@@ -435,6 +435,7 @@ def compare_records(truth_records, query_records):
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument("--truth", required=True, metavar="FILE", help="Source of truth file (TSV)")
 parser.add_argument("--query", required=True, metavar="FILE", help="Query file (NDJSON)")
+parser.add_argument("--output-strain-map", required=False, metavar="FILE", help="Output a map of strain names (FAUNA -> NEW)")

 args = parser.parse_args()

@@ -454,6 +455,16 @@ def compare_records(truth_records, query_records):
 remap_via_simplified_strain(missing_keys, added_keys, truth_records, query_records)
 (truth_keys, query_keys, missing_keys, added_keys, common_keys) = compare_records(truth_records, query_records)

+if args.output_strain_map:
+    print(f"\nWriting out strain map of these {len(common_keys)} matches if the keys differ")
+    with open(args.output_strain_map, 'w') as fh:
+        for key in common_keys:
+            truth_strain = truth_records[key]['strain']  # FAUNA
+            query_strain = query_records[key]['strain']  # CURATED NDJSON
+            if truth_strain != query_strain:
+                print(f"{truth_strain}\t{query_strain}", file=fh)
+    print("DONE.")
+
 print()
 # Allow missing "truth" keys IF --extra-truth, otherwise report the rows
Lines changed: 51 additions & 0 deletions

#!/usr/bin/env python3
"""
Attempts to link strain names via a common ID
"""
import argparse
import sys
from augur.io import read_metadata
from augur.io.json import load_ndjson

type EpiIsl = str
type StrainName = str

def parse_existing_metadata(fname: str) -> dict[EpiIsl, StrainName]:
    df = read_metadata(fname)
    return dict(zip(df['gisaid_epi_isl'], df.index))

def parse_ndjson(fname: str) -> dict[EpiIsl, StrainName]:
    with open(fname) as fh:
        # load_ndjson reads lazily, so consume it while the file is still open
        records = load_ndjson(fh)
        return {record['gisaid_epi_isl']: record['strain'] for record in records}


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--original-metadata", required=True, help="Original (fauna) metadata TSV with 'strain' and 'gisaid_epi_isl' columns")
    parser.add_argument("--new-metadata", required=True, help="Curated NDJSON")
    parser.add_argument("--changed", required=True, help="TSV of strain names which have changed. Map is FAUNA -> NEW")
    args = parser.parse_args()

    fauna = parse_existing_metadata(args.original_metadata)
    curated = parse_ndjson(args.new_metadata)

    unchanged, changed, missing = 0, 0, 0
    with open(args.changed, 'w') as changed_fh:
        print("FAUNA_STRAIN\tCURATED_STRAIN", file=changed_fh)
        for epi_isl, fauna_name in fauna.items():
            new_name = curated.get(epi_isl, None)
            if new_name is None:
                missing += 1
                continue
            if fauna_name == new_name:
                unchanged += 1
            else:
                changed += 1
                print(f"{fauna_name}\t{new_name}", file=changed_fh)

    print(f"Using {args.original_metadata} as source of truth...", file=sys.stderr)
    print(f"{missing=:,}", file=sys.stderr)
    print(f"{changed=:,}", file=sys.stderr)
    print(f"{unchanged=:,}", file=sys.stderr)
