ingest: add ppx ingest

rneher · rneher · commit 056f021971fa · 2025-12-22T11:52:24.000+01:00
diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -25,6 +25,8 @@ rule all:
     input:
         "results/sequences.fasta",
         "results/metadata.tsv",
+        "results/sequences_open.fasta",
+        "results/metadata_open.tsv",
 
 
 # Note that only PATHOGEN level customizations should be added to these
@@ -33,6 +35,7 @@ rule all:
 # custom_rules imported below to ensure that the core workflow is not complicated
 # by build specific rules.
 include: "rules/fetch_from_ncbi.smk"
+include: "rules/fetch_from_ppx.smk"
 include: "rules/curate.smk"
 include: "rules/nextclade.smk"
 
diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml
@@ -16,8 +16,11 @@ s3_dst: "s3://nextstrain-data/files/workflows/measles"
 
 # Mapping of files to upload
 files_to_upload:
-  ncbi.ndjson.zst: data/ncbi.ndjson
-  metadata.tsv.zst: results/metadata.tsv
+  # Each remote name is listed once: duplicate mapping keys are invalid YAML and
+  # most loaders silently keep only the last value.
+  # NOTE(review): sequences.fasta.zst was also listed with
+  # results/sequences_with_restricted.fasta (overridden) -- confirm the sources.
+  metadata.tsv.zst: results/metadata_with_restricted.tsv
   sequences.fasta.zst: results/sequences.fasta
   nextclade.tsv.zst: results/nextclade.tsv
   alignment.fasta.zst: results/alignment.fasta
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
@@ -8,6 +8,10 @@
 # Required to fetch from NCBI Datasets
 ncbi_taxon_id: "11234"
 
+ppx_fetch:  # Pathoplexus (PPX) download URLs read by the download_ppx_* rules
+  seqs: "https://lapis.pathoplexus.org/measles/sample/unalignedNucleotideSequences?versionStatus=LATEST_VERSION"
+  meta: "https://lapis.pathoplexus.org/measles/sample/details?dataFormat=csv&versionStatus=LATEST_VERSION"
+
 # The list of NCBI Datasets fields to include from NCBI Datasets output
 # These need to be the mneumonics of the NCBI Datasets fields, see docs for full list of fields
 # https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
@@ -23,6 +27,7 @@ ncbi_datasets_fields:
   - release-date
   - update-date
   - length
+  - genotype
   - host-name
   - isolate-lineage-source
   - biosample-acc
@@ -60,13 +65,29 @@ curate:
     submitter-country: submitter_country
     virus-name: virus_name
     is_reference: is_reference
+  ppx_field_map:  # PPX source field: renamed field (passed as field_map to rule curate)
+    sampleCollectionDate: date
+    displayName: strain
+    earliestReleaseDate: date_submitted
+    accessionVersion: PPX_accession
+    insdcAccessionFull: INSDC_accession
+    ncbiVirusName: virus_name
+    geoLocCountry: country
+    geoLocAdmin1: division
+    geoLocAdmin2: location
+    genotype: genotype_ppx  # NOTE(review): genotype_ppx is not listed in ppx_metadata_columns - confirm
+    hostNameCommon: host
+    insdcRawReadsAccession: sra_accession
+    dataUseTermsRestrictedUntil: restrictedUntil
+    dataUseTermsUrl: dataUseTerms__url
+    authorAffiliations: institution
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names across pathogens
   strain_regex: '^.+$'
   # Back up strain name field to use if 'strain' doesn't match regex above
   strain_backup_fields: []
   # List of date fields to standardize to ISO format YYYY-MM-DD
-  date_fields: ['date', 'date_released', 'date_updated']
+  date_fields: ['date', 'date_submitted']
   # List of expected date formats that are present in the date fields provided above
   # These date formats should use directives expected by datetime
   # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
@@ -118,10 +139,28 @@ curate:
     'authors',
     'abbr_authors',
     'institution',
-    'genotype_ncbi',
     'is_reference'
   ]
   genotype_field: "virus_name"
+  ppx_metadata_columns: [  # columns handed to `csvtk cut -f` by rule subset_metadata
+    'strain',
+    'date',
+    'accession',  # unversioned PPX accession
+    'PPX_accession',
+    'INSDC_accession',
+    'region',  # NOTE(review): no ppx_field_map entry produces this column - confirm its source
+    'country',
+    'division',
+    'location',
+    'host',
+    'date_submitted',
+    'sra_accession',
+    'authors',
+    'institution',
+    'dataUseTerms',
+    'dataUseTerms__url',
+    'restrictedUntil',
+  ]
 nextclade:
   dataset_name: "nextstrain/measles/N450/WHO-2012"
   field_map:
@@ -153,3 +192,70 @@ nextclade:
     qc.snpClusters.clusteredSNPs: private_snp_clusters
     qc.snpClusters.totalSNPs: private_total_snp_clusters
   id_field: "seqName"
+
+ppx_metadata_fields:  # Comma-joined into the "fields" query parameter of the PPX metadata URL
+  - "accessionVersion"
+  - "accession"
+  - "version"
+  - "submitter"
+  - "groupName"
+  - "submittedDate"
+  - "releasedDate"
+  - "dataUseTerms"
+  - "dataUseTermsRestrictedUntil"
+  - "dataUseTermsUrl"
+  - "assemblyReferenceGenomeAccession"
+  - "authorAffiliations"
+  - "authors"
+  - "bioprojectAccession"
+  - "biosampleAccession"
+  - "completeness"
+  - "displayName"
+  - "earliestReleaseDate"
+  - "frameShifts"
+  - "geoLocAdmin1"
+  - "geoLocAdmin2"
+  - "geoLocCity"
+  - "geoLocCountry"
+  - "geoLocLatitude"
+  - "geoLocLongitude"
+  - "geoLocSite"
+  - "hostAge"
+  - "hostAgeBin"
+  - "hostDisease"
+  - "hostGender"
+  - "hostHealthOutcome"
+  - "hostHealthState"
+  - "hostNameCommon"
+  - "hostOriginCountry"
+  - "hostVaccinationStatus"
+  - "insdcAccessionBase"
+  - "insdcAccessionFull"
+  - "insdcRawReadsAccession"
+  - "insdcVersion"
+  - "isLabHost"
+  - "length"
+  - "ncbiReleaseDate"
+  - "ncbiSourceDb"
+  - "ncbiSubmitterCountry"
+  - "ncbiUpdateDate"
+  - "ncbiVirusName"
+  - "ncbiVirusTaxId"
+  - "purposeOfSampling"
+  - "purposeOfSequencing"
+  - "qualityControlDetails"
+  - "qualityControlDetermination"
+  - "qualityControlIssues"
+  - "qualityControlMethodName"
+  - "qualityControlMethodVersion"
+  - "sampleCollectionDate"
+  - "sampleCollectionDateRangeLower"
+  - "sampleCollectionDateRangeUpper"
+  - "sampleType"
+  - "totalAmbiguousNucs"
+  - "totalDeletedNucs"
+  - "totalFrameShifts"
+  - "totalInsertedNucs"
+  - "totalSnps"
+  - "totalUnknownNucs"
+  - "travelHistory"
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
@@ -31,7 +31,7 @@ def format_field_map(field_map: dict[str, str]) -> list[str]:
 # separate files: a metadata TSV and a sequences FASTA.
 rule curate:
     input:
-        sequences_ndjson="data/ncbi.ndjson",
+        sequences_ndjson="data/ppx.ndjson",
         geolocation_rules=resolve_config_path(config["curate"]["local_geolocation_rules"]),
         annotations=resolve_config_path(config["curate"]["annotations"]),
     output:
@@ -42,7 +42,7 @@ rule curate:
     benchmark:
         "benchmarks/curate.txt"
     params:
-        field_map=format_field_map(config["curate"]["field_map"]),
+        field_map=format_field_map(config["curate"]["ppx_field_map"]),
         strain_regex=config["curate"]["strain_regex"],
         strain_backup_fields=config["curate"]["strain_backup_fields"],
         date_fields=config["curate"]["date_fields"],
@@ -72,8 +72,6 @@ rule curate:
             | augur curate format-dates \
                 --date-fields {params.date_fields:q} \
                 --expected-date-formats {params.expected_date_formats:q} \
-            | augur curate parse-genbank-location \
-                --location-field {params.genbank_location_field:q} \
             | augur curate titlecase \
                 --titlecase-fields {params.titlecase_fields:q} \
                 --articles {params.articles:q} \
@@ -84,8 +82,6 @@ rule curate:
                 --abbr-authors-field {params.abbr_authors_field:q} \
             | augur curate apply-geolocation-rules \
                 --geolocation-rules {input.geolocation_rules:q} \
-            | {workflow.basedir}/bin/parse-measles-genotype-names.py \
-                --genotype-field {params.genotype_field:q} \
             | augur curate apply-record-annotations \
                 --annotations {input.annotations:q} \
                 --id-field {params.annotations_id:q} \
@@ -102,7 +98,7 @@ rule subset_metadata:
     output:
         subset_metadata="data/subset_metadata.tsv",
     params:
-        metadata_fields=",".join(config["curate"]["metadata_columns"]),
+        metadata_fields=",".join(config["curate"]["ppx_metadata_columns"]),
     shell:
         """
         csvtk cut -t -f {params.metadata_fields} \
diff --git a/ingest/rules/fetch_from_ppx.smk b/ingest/rules/fetch_from_ppx.smk
@@ -0,0 +1,54 @@
+# Rules that download sequences and metadata from Pathoplexus (PPX) and
+# combine them into data/ppx.ndjson.
+
+rule download_ppx_seqs:
+    """Download the PPX sequences FASTA from the URL in config["ppx_fetch"]["seqs"]."""
+    output:
+        sequences="data/ppx_sequences.fasta",
+    params:
+        sequences_url=lambda w: config["ppx_fetch"]["seqs"],
+    # Allow retries in case of network errors
+    retries: 5
+    shell:
+        """
+        curl -fsSL {params.sequences_url:q} -o {output.sequences:q}
+        """
+
+rule download_ppx_meta:
+    """Download the PPX metadata CSV, limited to config["ppx_metadata_fields"]."""
+    output:
+        metadata="data/ppx_metadata.csv",
+    params:
+        # Base URL plus a "fields" query parameter listing the columns to return;
+        # assembled here so the shell command can quote the URL as a whole.
+        metadata_url=lambda w: "{}&fields={}".format(
+            config["ppx_fetch"]["meta"],
+            ",".join(config["ppx_metadata_fields"]),
+        ),
+    # Allow retries in case of network errors
+    retries: 5
+    shell:
+        """
+        curl -fsSL {params.metadata_url:q} -o {output.metadata:q}
+        """
+
+rule format_ppx_ndjson:
+    """Combine the PPX metadata CSV and sequences FASTA into one NDJSON file, matched on accessionVersion."""
+    input:
+        sequences="data/ppx_sequences.fasta",
+        metadata="data/ppx_metadata.csv",
+    output:
+        ndjson="data/ppx.ndjson",
+    log:
+        "logs/format_ppx_ndjson.txt",
+    shell:
+        """
+        augur curate passthru \
+            --metadata {input.metadata:q} \
+            --fasta {input.sequences:q} \
+            --seq-id-column accessionVersion \
+            --seq-field sequence \
+            --unmatched-reporting warn \
+            --duplicate-reporting warn \
+            2> {log:q} > {output.ndjson:q}
+        """
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
@@ -124,3 +124,22 @@ rule join_metadata_and_nextclade:
             --output-metadata {output.metadata:q} \
             --no-source-columns
         """
+
+rule extract_open_data:
+    """Drop records with dataUseTerms=RESTRICTED to produce the *_open outputs."""
+    input:
+        metadata="results/metadata.tsv",
+        sequences="results/sequences.fasta",
+    output:
+        metadata="results/metadata_open.tsv",
+        sequences="results/sequences_open.fasta",
+    shell:
+        """
+        augur filter \
+            --metadata {input.metadata:q} \
+            --sequences {input.sequences:q} \
+            --metadata-id-columns accession \
+            --exclude-where "dataUseTerms=RESTRICTED" \
+            --output-metadata {output.metadata:q} \
+            --output-sequences {output.sequences:q}
+        """