88# Required to fetch from NCBI Datasets
99ncbi_taxon_id : " 11234"
1010
11+ ppx_fetch :
12+ seqs : https://lapis.pathoplexus.org/measles/sample/unalignedNucleotideSequences?versionStatus=LATEST_VERSION
13+ meta : https://lapis.pathoplexus.org/measles/sample/details?dataFormat=csv&versionStatus=LATEST_VERSION
14+
1115# The list of NCBI Datasets fields to include from NCBI Datasets output
1216# These need to be the mnemonics of the NCBI Datasets fields, see docs for full list of fields
1317# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
@@ -23,6 +27,7 @@ ncbi_datasets_fields:
2327 - release-date
2428 - update-date
2529 - length
30+ - genotype
2631 - host-name
2732 - isolate-lineage-source
2833 - biosample-acc
@@ -60,13 +65,29 @@ curate:
6065 submitter-country : submitter_country
6166 virus-name : virus_name
6267 is_reference : is_reference
68+ ppx_field_map :
69+ sampleCollectionDate : date
70+ displayName : strain
71+ earliestReleaseDate : date_submitted
72+ accessionVersion : PPX_accession
73+ insdcAccessionFull : INSDC_accession
74+ ncbiVirusName : virus_name
75+ geoLocCountry : country
76+ geoLocAdmin1 : division
77+ geoLocAdmin2 : location
78+ genotype : genotype_ppx
79+ hostNameCommon : host
80+ insdcRawReadsAccession : sra_accession
81+ dataUseTermsRestrictedUntil : restrictedUntil
82+ dataUseTermsUrl : dataUseTerms__url
83+ authorAffiliations : institution
6384 # Standardized strain name regex
6485 # Currently accepts any characters because we do not have a clear standard for strain names across pathogens
6586 strain_regex : ' ^.+$'
6687 # Backup strain name fields to use if 'strain' doesn't match the regex above
6788 strain_backup_fields : []
6889 # List of date fields to standardize to ISO format YYYY-MM-DD
69- date_fields : ['date', 'date_released', 'date_updated ']
90+ date_fields : ['date', 'date_submitted ']
7091 # List of expected date formats that are present in the date fields provided above
7192 # These date formats should use directives expected by datetime
7293 # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
@@ -118,10 +139,28 @@ curate:
118139 ' authors' ,
119140 ' abbr_authors' ,
120141 ' institution' ,
121- ' genotype_ncbi' ,
122142 ' is_reference'
123143 ]
124144 genotype_field : " virus_name"
145+ ppx_metadata_columns : [
146+ ' strain' ,
147+ ' date' ,
148+ ' accession' , # unversioned PPX accession
149+ ' PPX_accession' ,
150+ ' INSDC_accession' ,
151+ ' region' ,
152+ ' country' ,
153+ ' division' ,
154+ ' location' ,
155+ ' host' ,
156+ ' date_submitted' ,
157+ ' sra_accession' ,
158+ ' authors' ,
159+ ' institution' ,
160+ ' dataUseTerms' ,
161+ ' dataUseTerms__url' ,
162+ ' restrictedUntil' ,
163+ ]
125164nextclade :
126165 dataset_name : " nextstrain/measles/N450/WHO-2012"
127166 field_map :
@@ -153,3 +192,70 @@ nextclade:
153192 qc.snpClusters.clusteredSNPs : private_snp_clusters
154193 qc.snpClusters.totalSNPs : private_total_snp_clusters
155194 id_field : " seqName"
195+
196+ ppx_metadata_fields : # Used to create the URL to download PPX metadata
197+ - " accessionVersion"
198+ - " accession"
199+ - " version"
200+ - " submitter"
201+ - " groupName"
202+ - " submittedDate"
203+ - " releasedDate"
204+ - " dataUseTerms"
205+ - " dataUseTermsRestrictedUntil"
206+ - " dataUseTermsUrl"
207+ - " assemblyReferenceGenomeAccession"
208+ - " authorAffiliations"
209+ - " authors"
210+ - " bioprojectAccession"
211+ - " biosampleAccession"
212+ - " completeness"
213+ - " displayName"
214+ - " earliestReleaseDate"
215+ - " frameShifts"
216+ - " geoLocAdmin1"
217+ - " geoLocAdmin2"
218+ - " geoLocCity"
219+ - " geoLocCountry"
220+ - " geoLocLatitude"
221+ - " geoLocLongitude"
222+ - " geoLocSite"
223+ - " hostAge"
224+ - " hostAgeBin"
225+ - " hostDisease"
226+ - " hostGender"
227+ - " hostHealthOutcome"
228+ - " hostHealthState"
229+ - " hostNameCommon"
230+ - " hostOriginCountry"
231+ - " hostVaccinationStatus"
232+ - " insdcAccessionBase"
233+ - " insdcAccessionFull"
234+ - " insdcRawReadsAccession"
235+ - " insdcVersion"
236+ - " isLabHost"
237+ - " length"
238+ - " ncbiReleaseDate"
239+ - " ncbiSourceDb"
240+ - " ncbiSubmitterCountry"
241+ - " ncbiUpdateDate"
242+ - " ncbiVirusName"
243+ - " ncbiVirusTaxId"
244+ - " purposeOfSampling"
245+ - " purposeOfSequencing"
246+ - " qualityControlDetails"
247+ - " qualityControlDetermination"
248+ - " qualityControlIssues"
249+ - " qualityControlMethodName"
250+ - " qualityControlMethodVersion"
251+ - " sampleCollectionDate"
252+ - " sampleCollectionDateRangeLower"
253+ - " sampleCollectionDateRangeUpper"
254+ - " sampleType"
255+ - " totalAmbiguousNucs"
256+ - " totalDeletedNucs"
257+ - " totalFrameShifts"
258+ - " totalInsertedNucs"
259+ - " totalSnps"
260+ - " totalUnknownNucs"
261+ - " travelHistory"
0 commit comments