Skip to content

Commit 5cccbd9

Browse files
committed
ingest: add url curation
1 parent 056f021 commit 5cccbd9

File tree

3 files changed

+46
-0
lines changed

3 files changed

+46
-0
lines changed

ingest/bin/curate-urls.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
"""custom curate script to add URLs"""
2+
import sys
3+
import argparse
4+
from typing import Iterable
5+
6+
from augur.curate import validate_records
7+
from augur.io.json import dump_ndjson, load_ndjson
8+
9+
def run(args: argparse.Namespace, records: Iterable[dict]) -> Iterable[dict]:
    """Yield a copy of each record with accession URL fields added.

    Two fields are set on a shallow copy of every input record:

    * ``PPX_accession__url`` -- ``https://pathoplexus.org/seq/<PPX_accession>``
    * ``INSDC_accession__url`` --
      ``https://www.ncbi.nlm.nih.gov/nuccore/<INSDC_accession>``

    A URL field is set to the empty string when the corresponding accession
    is missing from the record or is falsy (e.g. ``""`` or ``None``).

    Args:
        args: Parsed command-line arguments. Not read by this function; kept
            so the signature matches how the script entry point calls it.
        records: Iterable of record dicts to annotate.

    Yields:
        A shallow copy of each record, in input order, with the two URL
        fields added (or overwritten if already present). The input dicts
        are not modified.
    """
    for record in records:
        # Work on a copy so the caller's dicts are left untouched.
        record = record.copy()

        ppx_accession = record.get('PPX_accession')  # versioned
        insdc_accession = record.get('INSDC_accession')  # versioned

        # Add INSDC_accession__url and PPX_accession__url fields to NDJSON records
        record['PPX_accession__url'] = (
            f"https://pathoplexus.org/seq/{ppx_accession}"
            if ppx_accession
            else ""
        )
        record['INSDC_accession__url'] = (
            f"https://www.ncbi.nlm.nih.gov/nuccore/{insdc_accession}"
            if insdc_accession
            else ""
        )

        yield record
26+
27+
28+
if __name__ == "__main__":
    # No script-specific options are declared; the parser still supplies
    # argparse's default --help output using the module docstring.
    cli_args = argparse.ArgumentParser(description=__doc__).parse_args()

    # Pipeline: NDJSON from stdin -> input field check -> URL curation
    # -> output field check -> NDJSON dump.
    # The boolean passed to validate_records is True for the input side and
    # False for the output side.
    checked_input = validate_records(load_ndjson(sys.stdin), __doc__, True)
    curated = run(cli_args, checked_input)
    checked_output = validate_records(curated, __doc__, False)

    dump_ndjson(checked_output)

ingest/defaults/config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,9 @@ curate:
147147
'date',
148148
'accession', # unversioned PPX accession
149149
'PPX_accession',
150+
'PPX_accession__url',
150151
'INSDC_accession',
152+
'INSDC_accession__url',
151153
'region',
152154
'country',
153155
'division',

ingest/rules/curate.smk

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ rule curate:
8282
--abbr-authors-field {params.abbr_authors_field:q} \
8383
| augur curate apply-geolocation-rules \
8484
--geolocation-rules {input.geolocation_rules:q} \
85+
| python ./bin/curate-urls.py \
8586
| augur curate apply-record-annotations \
8687
--annotations {input.annotations:q} \
8788
--id-field {params.annotations_id:q} \

0 commit comments

Comments
 (0)