nextstrain · victorlin · Oct 16, 2025 · Oct 16, 2025 · Oct 16, 2025 · Oct 16, 2025
diff --git a/ingest/README.md b/ingest/README.md
@@ -1,7 +1,7 @@
 # Ingest
 
-This workflow ingests public data from NCBI and outputs curated metadata and
-sequences that can be used as input for the phylogenetic workflow.
+This workflow ingests public data from Pathoplexus and outputs curated metadata
+and sequences that can be used as input for the phylogenetic workflow.
 
 If you have another data source or private data that needs to be formatted for
 the phylogenetic workflow, then you can use a similar workflow to curate your
@@ -25,18 +25,6 @@ This produces the default outputs of the ingest workflow:
 - metadata      = results/metadata_all.tsv
 - sequences     = results/sequences_all.fasta
 
-### Dumping the full raw metadata from NCBI Datasets
-
-The workflow has a target for dumping the full raw metadata from NCBI Datasets.
-
-```
-nextstrain build ingest dump_ncbi_dataset_report
-```
-
-This will produce the file `ingest/data/ncbi_dataset_report_raw.tsv`,
-which you can inspect to determine what fields and data to use if you want to
-configure the workflow for your pathogen.
-
 ## Defaults
 
 The defaults directory contains all of the default configurations for the ingest workflow.

diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -10,6 +10,8 @@ rule all:
     input:
         sequences="results/sequences.fasta",
         metadata="results/metadata.tsv",
+        sequences_open="results/sequences_open.fasta",
+        metadata_open="results/metadata_open.tsv",
 
 # Shared Snakemake files with generic functions are shared across pathogens
 include: "../shared/vendored/snakemake/config.smk"
@@ -18,7 +20,7 @@ include: "../shared/vendored/snakemake/config.smk"
 # If there are build-specific customizations, they should be added with the
 # custom_rules imported below to ensure that the core workflow is not complicated
 # by build-specific rules.
-include: "rules/fetch_from_ncbi.smk"
+include: "rules/fetch.smk"
 include: "rules/curate.smk"
 include: "rules/nextclade.smk"
 
@@ -35,4 +37,4 @@ include: "rules/nextclade.smk"
 if "custom_rules" in config:
     for rule_file in config["custom_rules"]:
 
-        include: rule_file
+        include: rule_file
diff --git a/ingest/defaults/annotations.tsv b/ingest/defaults/annotations.tsv
@@ -272,25 +272,25 @@ ON694341	institution	Centre for Biological Threats, Highly Pathogenic Viruses, R
 ON694342	institution	Centre for Biological Threats, Highly Pathogenic Viruses, Robert Koch Institute, Germany
 ON720848	institution	Microbial Genomics, Hospital General Universitario Gregorio Marañón, Madrid, Spain
 ON720849	institution	Microbial Genomics, Hospital General Universitario Gregorio Marañón, Madrid, Spain
-KT163243	date	1968-XX-XX
-AF260968	date	1951-XX-XX
-AF260968	region	Africa
-AF260968	country	Egypt
-AF260968	host	Homo sapians
-AF196835	host	Phoenicopterus chilensis
-AF196835	date	1999-XX-XX
-AY765264	date	1997-XX-XX
-AY765264	country	Czech Republic
-AY765264	region	Europe
-DQ318020	date	1972-XX-XX
-DQ318020	host	Culex tigripes
-D00246	country	Australia
-D00246	date	1960-XX-XX
-EF631122	date	XXXX-XX-XX
-EF631123	date	XXXX-XX-XX
-DQ116961	date	2004-XX-XX
-AY603654	date	1976-XX-XX
-AM404308	date	1971-XX-XX
-AF260968	date	1951-XX-XX
-AY660002	date	2003-XX-XX
-AY268132	date	2000-XX-XX
+PP_0001F2D	date	1968-XX-XX
+PP_000HJBT	date	1951-XX-XX
+PP_000HJBT	region	Africa
+PP_000HJBT	country	Egypt
+PP_000HJBT	host	Homo sapiens
+PP_000HHL9	host	Phoenicopterus chilensis
+PP_000HHL9	date	1999-XX-XX
+PP_000HY01	date	1997-XX-XX
+PP_000HY01	country	Czech Republic
+PP_000HY01	region	Europe
+PP_000JBDU	date	1972-XX-XX
+PP_000JBDU	host	Culex tigripes
+PP_000HZ4S	country	Australia
+PP_000HZ4S	date	1960-XX-XX
+PP_000JSDD	date	XXXX-XX-XX
+PP_000JSEB	date	XXXX-XX-XX
+PP_000J96A	date	2004-XX-XX
+PP_000HXJZ	date	1976-XX-XX
+PP_000HQ6X	date	1971-XX-XX
+PP_000HJBT	date	1951-XX-XX
+PP_000HXRK	date	2003-XX-XX
+PP_000HRSP	date	2000-XX-XX
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
@@ -4,70 +4,38 @@
 # Define optional config parameters with their default values here so that users
 # do not have to dig through the workflows to figure out the default values
 
-# Required to fetch from NCBI Datasets
-ncbi_taxon_id: "11082"
-
-# The list of NCBI Datasets fields to include from NCBI Datasets output
-# These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields
-# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
-# Note: the "accession" field MUST be provided to match with the sequences
-ncbi_datasets_fields:
-  - accession
-  - sourcedb
-  - isolate-lineage
-  - geo-region
-  - geo-location
-  - isolate-collection-date
-  - release-date
-  - update-date
-  - length
-  - host-name
-  - is-lab-host
-  - isolate-lineage-source
-  - bioprojects
-  - biosample-acc
-  - sra-accs
-  - submitter-names
-  - submitter-affiliation
+ppx_fetch:  # Pathoplexus LAPIS endpoints; both URLs request versionStatus=LATEST_VERSION
+  seqs: 'https://lapis.pathoplexus.org/west-nile/sample/unalignedNucleotideSequences?versionStatus=LATEST_VERSION'
+  meta: 'https://lapis.pathoplexus.org/west-nile/sample/details?dataFormat=csv&versionStatus=LATEST_VERSION'
 
 # Config parameters related to the curate pipeline
 curate:
   # The path to the local geolocation rules within the pathogen repo
   # The path should be relative to the ingest directory.
   local_geolocation_rules: "defaults/geolocation-rules.tsv"
-  # The original field names should match the ncbi_datasets_fields provided above.
   # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
   field_map:
-    accession: accession
-    accession_version: accession_version
-    sourcedb: database
-    isolate-lineage: strain
-    geo-region: region
-    geo-location: location
-    isolate-collection-date: date
-    release-date: date_released
-    update-date: date_updated
-    length: length
-    host-name: host
-    is-lab-host: is_lab_host
-    isolate-lineage-source: sample_type
-    biosample-acc: biosample_accessions
-    sra-accs: sra_accessions
-    submitter-names: full_authors
-    submitter-affiliation: institution
-  # Standardized strain name regex
-  # Currently accepts any characters because we do not have a clear standard for strain names across pathogens
-  strain_regex: "^.+$"
-  # Back up strain name field to use if "strain" doesn"t match regex above
-  strain_backup_fields: ["accession"]
+    accessionVersion: PPX_accession
+    insdcAccessionFull: INSDC_accession
+    insdcRawReadsAccession: sra_accession
+    displayName: strain
+    geoLocCountry: country
+    geoLocAdmin1: division
+    geoLocAdmin2: location
+    sampleCollectionDate: date
+    earliestReleaseDate: date_submitted
+    hostNameCommon: host
+    isLabHost: is_lab_host
+    dataUseTermsRestrictedUntil: restrictedUntil
+    dataUseTermsUrl: dataUseTerms__url
+    authors: full_authors
+    authorAffiliations: institution
   # List of date fields to standardize to ISO format YYYY-MM-DD
-  date_fields: ["date", "date_released", "date_updated"]
+  date_fields: ["date", "date_submitted"]
   # List of expected date formats that are present in the date fields provided above
   # These date formats should use directives expected by datetime
   # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
   expected_date_formats: ["%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"]
-  # The expected field that contains the GenBank geo_loc_name
-  genbank_location_field: location
   titlecase:
     # List of string fields to titlecase
     fields: ["region", "country", "division", "location"]
@@ -93,16 +61,19 @@ curate:
   output_id_field: "accession"
   # The field in the NDJSON record that contains the actual genomic sequence
   output_sequence_field: "sequence"
-  # The field in the NDJSON record that contains the actual GenBank accession
-  genbank_accession: 'accession'
+  # The field in the NDJSON record that contains the actual Pathoplexus accession
+  pathoplexus_accession: 'PPX_accession'
+  # The field in the NDJSON record that contains the actual INSDC accession
+  insdc_accession: 'INSDC_accession'
 
   # The list of metadata columns to keep in the final output of the curation pipeline.
   metadata_columns: [
     'accession',
-    #'genbank_accession_rev',
+    'PPX_accession',
+    'PPX_accession__url',
+    'INSDC_accession',
+    'INSDC_accession__url',
     #'strain',
-    #'strain_s',
-    #'viruslineage_ids',
     'date',
     #'updated',
     'region',
@@ -116,15 +87,11 @@ curate:
     'is_lab_host',
     #'date_submitted',
     #'sra_accession',
-    #'full_authors',
-    #'reverse',
     'authors',
-    #'institution',
-    #'title',
-    #'journal',
-    #'publications',
-    #'paper_url',
-    'url',
+    'institution',
+    'dataUseTerms',
+    'dataUseTerms__url',
+    'restrictedUntil',
     'length',
   ]
 
@@ -135,5 +102,72 @@ nextclade:
 
 pathoplexus:
   URL: 'https://lapis.pathoplexus.org/west-nile/sample/details'
-  fields: 'insdcAccessionBase,lineage'
-  accession_field: 'insdcAccessionBase'
+  fields: 'accession,lineage'
+  accession_field: 'accession'
+
+ppx_metadata_fields:  # NOTE(review): presumably the Pathoplexus metadata columns used by the fetch rules - confirm
+  - "accessionVersion"
+  - "accession"
+  - "version"
+  - "submitter"
+  - "groupName"
+  - "submittedDate"
+  - "releasedDate"
+  - "dataUseTerms"
+  - "dataUseTermsRestrictedUntil"
+  - "dataUseTermsUrl"
+  - "assemblyReferenceGenomeAccession"
+  - "authorAffiliations"
+  - "authors"
+  - "bioprojectAccession"
+  - "biosampleAccession"
+  - "completeness"
+  - "displayName"
+  - "earliestReleaseDate"
+  - "frameShifts"
+  - "geoLocAdmin1"
+  - "geoLocAdmin2"
+  - "geoLocCity"
+  - "geoLocCountry"
+  - "geoLocLatitude"
+  - "geoLocLongitude"
+  - "geoLocSite"
+  - "hostAge"
+  - "hostAgeBin"
+  - "hostDisease"
+  - "hostGender"
+  - "hostHealthOutcome"
+  - "hostHealthState"
+  - "hostNameCommon"
+  - "hostOriginCountry"
+  - "hostVaccinationStatus"
+  - "insdcAccessionBase"
+  - "insdcAccessionFull"
+  - "insdcRawReadsAccession"
+  - "insdcVersion"
+  - "isLabHost"
+  - "length"
+  - "ncbiReleaseDate"
+  - "ncbiSourceDb"
+  - "ncbiSubmitterCountry"
+  - "ncbiUpdateDate"
+  - "ncbiVirusName"
+  - "ncbiVirusTaxId"
+  - "purposeOfSampling"
+  - "purposeOfSequencing"
+  - "qualityControlDetails"
+  - "qualityControlDetermination"
+  - "qualityControlIssues"
+  - "qualityControlMethodName"
+  - "qualityControlMethodVersion"
+  - "sampleCollectionDate"
+  - "sampleCollectionDateRangeLower"
+  - "sampleCollectionDateRangeUpper"
+  - "sampleType"
+  - "totalAmbiguousNucs"
+  - "totalDeletedNucs"
+  - "totalFrameShifts"
+  - "totalInsertedNucs"
+  - "totalSnps"
+  - "totalUnknownNucs"
+  - "travelHistory"