Skip to content

Commit 056f021

Browse files
committed
ingest: add ppx ingest
1 parent 1090958 commit 056f021

File tree

6 files changed

+178
-11
lines changed

6 files changed

+178
-11
lines changed

ingest/Snakefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ rule all:
2525
input:
2626
"results/sequences.fasta",
2727
"results/metadata.tsv",
28+
"results/sequences_open.fasta",
29+
"results/metadata_open.tsv",
2830

2931

3032
# Note that only PATHOGEN level customizations should be added to these
@@ -33,6 +35,7 @@ rule all:
3335
# custom_rules imported below to ensure that the core workflow is not complicated
3436
# by build specific rules.
3537
include: "rules/fetch_from_ncbi.smk"
38+
include: "rules/fetch_from_ppx.smk"
3639
include: "rules/curate.smk"
3740
include: "rules/nextclade.smk"
3841

ingest/build-configs/nextstrain-automation/config.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ s3_dst: "s3://nextstrain-data/files/workflows/measles"
1616

1717
# Mapping of files to upload
1818
files_to_upload:
19-
ncbi.ndjson.zst: data/ncbi.ndjson
20-
metadata.tsv.zst: results/metadata.tsv
19+
metadata.tsv.zst: results/metadata_with_restricted.tsv
20+
sequences.fasta.zst: results/sequences_with_restricted.fasta
21+
metadata.tsv.zst: results/metadata_with_restricted.tsv
2122
sequences.fasta.zst: results/sequences.fasta
2223
nextclade.tsv.zst: results/nextclade.tsv
2324
alignment.fasta.zst: results/alignment.fasta

ingest/defaults/config.yaml

Lines changed: 108 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88
# Required to fetch from NCBI Datasets
99
ncbi_taxon_id: "11234"
1010

11+
ppx_fetch:
12+
seqs: https://lapis.pathoplexus.org/measles/sample/unalignedNucleotideSequences?versionStatus=LATEST_VERSION
13+
meta: https://lapis.pathoplexus.org/measles/sample/details?dataFormat=csv&versionStatus=LATEST_VERSION
14+
1115
# The list of NCBI Datasets fields to include from NCBI Datasets output
1216
# These need to be the mnemonics of the NCBI Datasets fields; see the docs for the full list of fields
1317
# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
@@ -23,6 +27,7 @@ ncbi_datasets_fields:
2327
- release-date
2428
- update-date
2529
- length
30+
- genotype
2631
- host-name
2732
- isolate-lineage-source
2833
- biosample-acc
@@ -60,13 +65,29 @@ curate:
6065
submitter-country: submitter_country
6166
virus-name: virus_name
6267
is_reference: is_reference
68+
ppx_field_map:
69+
sampleCollectionDate: date
70+
displayName: strain
71+
earliestReleaseDate: date_submitted
72+
accessionVersion: PPX_accession
73+
insdcAccessionFull: INSDC_accession
74+
ncbiVirusName: virus_name
75+
geoLocCountry: country
76+
geoLocAdmin1: division
77+
geoLocAdmin2: location
78+
genotype: genotype_ppx
79+
hostNameCommon: host
80+
insdcRawReadsAccession: sra_accession
81+
dataUseTermsRestrictedUntil: restrictedUntil
82+
dataUseTermsUrl: dataUseTerms__url
83+
authorAffiliations: institution
6384
# Standardized strain name regex
6485
# Currently accepts any characters because we do not have a clear standard for strain names across pathogens
6586
strain_regex: '^.+$'
6687
# Back up strain name field to use if 'strain' doesn't match regex above
6788
strain_backup_fields: []
6889
# List of date fields to standardize to ISO format YYYY-MM-DD
69-
date_fields: ['date', 'date_released', 'date_updated']
90+
date_fields: ['date', 'date_submitted']
7091
# List of expected date formats that are present in the date fields provided above
7192
# These date formats should use directives expected by datetime
7293
# See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
@@ -118,10 +139,28 @@ curate:
118139
'authors',
119140
'abbr_authors',
120141
'institution',
121-
'genotype_ncbi',
122142
'is_reference'
123143
]
124144
genotype_field: "virus_name"
145+
ppx_metadata_columns: [
146+
'strain',
147+
'date',
148+
'accession', # unversioned PPX accession
149+
'PPX_accession',
150+
'INSDC_accession',
151+
'region',
152+
'country',
153+
'division',
154+
'location',
155+
'host',
156+
'date_submitted',
157+
'sra_accession',
158+
'authors',
159+
'institution',
160+
'dataUseTerms',
161+
'dataUseTerms__url',
162+
'restrictedUntil',
163+
]
125164
nextclade:
126165
dataset_name: "nextstrain/measles/N450/WHO-2012"
127166
field_map:
@@ -153,3 +192,70 @@ nextclade:
153192
qc.snpClusters.clusteredSNPs: private_snp_clusters
154193
qc.snpClusters.totalSNPs: private_total_snp_clusters
155194
id_field: "seqName"
195+
196+
ppx_metadata_fields: # Used to create the URL to download PPX metadata
197+
- "accessionVersion"
198+
- "accession"
199+
- "version"
200+
- "submitter"
201+
- "groupName"
202+
- "submittedDate"
203+
- "releasedDate"
204+
- "dataUseTerms"
205+
- "dataUseTermsRestrictedUntil"
206+
- "dataUseTermsUrl"
207+
- "assemblyReferenceGenomeAccession"
208+
- "authorAffiliations"
209+
- "authors"
210+
- "bioprojectAccession"
211+
- "biosampleAccession"
212+
- "completeness"
213+
- "displayName"
214+
- "earliestReleaseDate"
215+
- "frameShifts"
216+
- "geoLocAdmin1"
217+
- "geoLocAdmin2"
218+
- "geoLocCity"
219+
- "geoLocCountry"
220+
- "geoLocLatitude"
221+
- "geoLocLongitude"
222+
- "geoLocSite"
223+
- "hostAge"
224+
- "hostAgeBin"
225+
- "hostDisease"
226+
- "hostGender"
227+
- "hostHealthOutcome"
228+
- "hostHealthState"
229+
- "hostNameCommon"
230+
- "hostOriginCountry"
231+
- "hostVaccinationStatus"
232+
- "insdcAccessionBase"
233+
- "insdcAccessionFull"
234+
- "insdcRawReadsAccession"
235+
- "insdcVersion"
236+
- "isLabHost"
237+
- "length"
238+
- "ncbiReleaseDate"
239+
- "ncbiSourceDb"
240+
- "ncbiSubmitterCountry"
241+
- "ncbiUpdateDate"
242+
- "ncbiVirusName"
243+
- "ncbiVirusTaxId"
244+
- "purposeOfSampling"
245+
- "purposeOfSequencing"
246+
- "qualityControlDetails"
247+
- "qualityControlDetermination"
248+
- "qualityControlIssues"
249+
- "qualityControlMethodName"
250+
- "qualityControlMethodVersion"
251+
- "sampleCollectionDate"
252+
- "sampleCollectionDateRangeLower"
253+
- "sampleCollectionDateRangeUpper"
254+
- "sampleType"
255+
- "totalAmbiguousNucs"
256+
- "totalDeletedNucs"
257+
- "totalFrameShifts"
258+
- "totalInsertedNucs"
259+
- "totalSnps"
260+
- "totalUnknownNucs"
261+
- "travelHistory"

ingest/rules/curate.smk

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def format_field_map(field_map: dict[str, str]) -> list[str]:
3131
# separate files: a metadata TSV and a sequences FASTA.
3232
rule curate:
3333
input:
34-
sequences_ndjson="data/ncbi.ndjson",
34+
sequences_ndjson="data/ppx.ndjson",
3535
geolocation_rules=resolve_config_path(config["curate"]["local_geolocation_rules"]),
3636
annotations=resolve_config_path(config["curate"]["annotations"]),
3737
output:
@@ -42,7 +42,7 @@ rule curate:
4242
benchmark:
4343
"benchmarks/curate.txt"
4444
params:
45-
field_map=format_field_map(config["curate"]["field_map"]),
45+
field_map=format_field_map(config["curate"]["ppx_field_map"]),
4646
strain_regex=config["curate"]["strain_regex"],
4747
strain_backup_fields=config["curate"]["strain_backup_fields"],
4848
date_fields=config["curate"]["date_fields"],
@@ -72,8 +72,6 @@ rule curate:
7272
| augur curate format-dates \
7373
--date-fields {params.date_fields:q} \
7474
--expected-date-formats {params.expected_date_formats:q} \
75-
| augur curate parse-genbank-location \
76-
--location-field {params.genbank_location_field:q} \
7775
| augur curate titlecase \
7876
--titlecase-fields {params.titlecase_fields:q} \
7977
--articles {params.articles:q} \
@@ -84,8 +82,6 @@ rule curate:
8482
--abbr-authors-field {params.abbr_authors_field:q} \
8583
| augur curate apply-geolocation-rules \
8684
--geolocation-rules {input.geolocation_rules:q} \
87-
| {workflow.basedir}/bin/parse-measles-genotype-names.py \
88-
--genotype-field {params.genotype_field:q} \
8985
| augur curate apply-record-annotations \
9086
--annotations {input.annotations:q} \
9187
--id-field {params.annotations_id:q} \
@@ -102,7 +98,7 @@ rule subset_metadata:
10298
output:
10399
subset_metadata="data/subset_metadata.tsv",
104100
params:
105-
metadata_fields=",".join(config["curate"]["metadata_columns"]),
101+
metadata_fields=",".join(config["curate"]["ppx_metadata_columns"]),
106102
shell:
107103
"""
108104
csvtk cut -t -f {params.metadata_fields} \

ingest/rules/fetch_from_ppx.smk

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
rule download_ppx_seqs:
    # Download the PPX sequences FASTA from the URL stored in
    # config["ppx_fetch"]["seqs"] and save it under data/.
    output:
        sequences="data/ppx_sequences.fasta",
    # Allow retries in case of network errors
    retries: 5
    params:
        # Resolved lazily so the URL is read from the config when the job is set up.
        sequences_url=lambda wildcards: config["ppx_fetch"]["seqs"],
    shell:
        # curl flags: -f fail on HTTP errors, -sS quiet but still print errors,
        # -L follow redirects.
        """
        curl -fsSL {params.sequences_url:q} -o {output.sequences}
        """
12+
13+
rule download_ppx_meta:
    # Download the PPX metadata CSV. The request URL is the endpoint stored in
    # config["ppx_fetch"]["meta"] with a `fields` query parameter appended that
    # lists every entry of config["ppx_metadata_fields"], comma-separated.
    output:
        metadata="data/ppx_metadata.csv",
    # Allow retries in case of network errors
    retries: 5
    params:
        metadata_url=lambda wildcards: config["ppx_fetch"]["meta"],
        # Comma-joined once at parse time; used verbatim in the query string.
        fields=",".join(config["ppx_metadata_fields"]),
    shell:
        # The URL is single-quoted so the shell does not interpret `&` and `?`.
        """
        curl -fsSL '{params.metadata_url}&fields={params.fields}' -o {output.metadata}
        """
25+
26+
rule format_ppx_ndjson:
    # Merge the downloaded PPX metadata CSV and sequences FASTA into a single
    # NDJSON file via `augur curate passthru`. Records are matched on the
    # `accessionVersion` metadata column and each sequence is written to the
    # `sequence` field. Unmatched and duplicate ids are reported as warnings
    # (see the log file).
    input:
        sequences="data/ppx_sequences.fasta",
        metadata="data/ppx_metadata.csv",
    output:
        ndjson="data/ppx.ndjson",
    log:
        "logs/format_ppx_ndjson.txt",
    shell:
        # stderr goes to the declared log file (referenced through {log} so the
        # directive and the redirect cannot drift apart); stdout is the NDJSON.
        """
        augur curate passthru \
            --metadata {input.metadata:q} \
            --fasta {input.sequences:q} \
            --seq-id-column accessionVersion \
            --seq-field sequence \
            --unmatched-reporting warn \
            --duplicate-reporting warn \
            2> {log:q} > {output.ndjson:q}
        """

ingest/rules/nextclade.smk

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,3 +124,20 @@ rule join_metadata_and_nextclade:
124124
--output-metadata {output.metadata:q} \
125125
--no-source-columns
126126
"""
127+
128+
rule extract_open_data:
    # Produce the "open" subset of the final results: `augur filter` drops
    # every record whose `dataUseTerms` metadata value is RESTRICTED and writes
    # the remaining metadata and sequences to the *_open outputs.
    input:
        metadata="results/metadata.tsv",
        sequences="results/sequences.fasta",
    output:
        metadata="results/metadata_open.tsv",
        sequences="results/sequences_open.fasta",
    shell:
        # Paths are interpolated with :q to match the quoting used by the other
        # rules in this workflow.
        """
        augur filter \
            --metadata {input.metadata:q} \
            --sequences {input.sequences:q} \
            --metadata-id-columns accession \
            --exclude-where "dataUseTerms=RESTRICTED" \
            --output-metadata {output.metadata:q} \
            --output-sequences {output.sequences:q}
        """

0 commit comments

Comments
 (0)