Skip to content

Commit 4e83871

Browse files
authored
Merge additional data during phylogenetic (instead of ingest) workflow #68
2 parents 2118fd0 + bfb6d5b commit 4e83871

File tree

18 files changed

+191
-132
lines changed

18 files changed

+191
-132
lines changed

.github/workflows/phylogenetic.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ jobs:
124124
nextstrain build \
125125
phylogenetic \
126126
deploy_all \
127-
--configfile build-configs/nextstrain-automation/config.yaml build-configs/washington-state/config.yaml build-configs/washington-state/config-public.yaml \
127+
--configfile build-configs/nextstrain-automation/config.yaml build-configs/washington-state/config.yaml \
128128
$CONFIG_OVERRIDES
129129
# Specifying artifact name to differentiate ingest build outputs from
130130
# the phylogenetic build outputs

ingest/defaults/annotations.tsv

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,3 @@
1-
MH507691 location San Diego
2-
OQ721209 location Yolo
3-
OQ721236 location Sacramento
4-
KX547286 location Chautauqua
5-
KX547169 location Oswego
6-
OQ721825 location Larimer
7-
MH507691 latitude 32.715736
8-
OQ721209 latitude 38.732967
9-
OQ721236 latitude 38.575764
10-
KX547286 latitude 42.209869
11-
KX547169 latitude 43.455345
12-
OQ721825 latitude 40.66641
13-
MH507691 longitude -117.161087
14-
OQ721209 longitude -121.807281
15-
OQ721236 longitude -121.478851
16-
KX547286 longitude -79.470428
17-
KX547169 longitude -76.510498
18-
OQ721825 longitude -105.46116
191
AF380138 country Democratic Republic of the Congo
202
AY741551 country Sierra Leone
213
DQ011153 country USA

ingest/defaults/config.yaml

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,6 @@ curate:
9999
output_sequence_field: "sequence"
100100
# The field in the NDJSON record that contains the actual GenBank accession
101101
genbank_accession: 'accession'
102-
# Added metadata columns
103-
added_columns: [
104-
'longitude',
105-
'latitude'
106-
]
107102

108103
# The list of metadata columns to keep in the final output of the curation pipeline.
109104
metadata_columns: [
@@ -128,8 +123,6 @@ curate:
128123
#'full_authors',
129124
#'reverse',
130125
'authors',
131-
'latitude',
132-
'longitude',
133126
#'institution',
134127
#'title',
135128
#'journal',

ingest/rules/curate.smk

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@ rule curate:
6868
authors_default_value=config["curate"]["authors_default_value"],
6969
abbr_authors_field=config["curate"]["abbr_authors_field"],
7070
annotations_id=config["curate"]["annotations_id"],
71-
added_columns=config["curate"]["added_columns"],
7271
id_field=config["curate"]["output_id_field"],
7372
sequence_field=config["curate"]["output_sequence_field"],
7473
shell:
@@ -97,8 +96,6 @@ rule curate:
9796
--geolocation-rules {input.all_geolocation_rules} \
9897
| ./scripts/transform-state-names \
9998
| ./scripts/post_process_metadata.py \
100-
| ./scripts/add-field-names \
101-
--metadata-columns {params.added_columns} \
10299
| ./scripts/transform-new-fields \
103100
--map-tsv {input.manual_mapping} \
104101
--map-id host \

ingest/scripts/add-field-names

Lines changed: 0 additions & 27 deletions
This file was deleted.

phylogenetic/Snakefile

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,11 @@ rule all:
3434
# custom_rules imported below to ensure that the core workflow is not complicated
3535
# by build-specific rules.
3636

37+
include: "rules/merge_additional_inputs.smk"
38+
include: "rules/prepare_sequences.smk"
3739
include: "rules/subsampling_manual.smk"
38-
3940
if config.get("subsampling", False):
4041
include: "rules/subsampling_configurable.smk"
41-
42-
include: "rules/prepare_sequences.smk"
4342
include: "rules/construct_phylogeny.smk"
4443
include: "rules/annotate_phylogeny.smk"
4544
include: "rules/export.smk"

phylogenetic/build-configs/ci/config.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
# for the CI workflow to run with the example data.
33

44
# Pull in metadata and sequences from the example_data directory
5-
input_metadata: "example_data/metadata.tsv"
6-
input_sequences: "example_data/sequences.fasta"
5+
inputs:
6+
- name: example
7+
metadata: "example_data/metadata.tsv"
8+
sequences: "example_data/sequences.fasta"
79

810
## Custom rules to run as part of the CI automated workflow
911
## The paths should be relative to the phylogenetic directory.

phylogenetic/build-configs/ci/copy_example_data.smk

Lines changed: 0 additions & 17 deletions
This file was deleted.

phylogenetic/build-configs/washington-state/config-public.yaml

Lines changed: 0 additions & 9 deletions
This file was deleted.

phylogenetic/build-configs/washington-state/config.yaml

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,17 @@ reference: "defaults/wa/reference.gb"
66
# Use 'IS88' as the root strain on the phylogenetic tree to place samples within the global context
77
root: "AF481864"
88

9-
# Pull in metadata and sequences from the ingest directory after it has been annotated with washington-state specific metadata
10-
input_metadata: "../ingest/results/metadata.tsv"
11-
input_sequences: "../ingest/results/sequences.fasta"
9+
# Sequences must be FASTA and metadata must be TSV
10+
# Both the sequences and metadata files must be zstd compressed
11+
inputs:
12+
- name: ncbi
13+
metadata: "s3://nextstrain-data/files/workflows/WNV/metadata.tsv.zst"
14+
sequences: "s3://nextstrain-data/files/workflows/WNV/sequences.fasta.zst"
15+
16+
additional_inputs:
17+
- name: private-lat-longs
18+
metadata: "data-private/metadata.tsv"
19+
sequences: "data-private/sequences.fasta"
1220

1321
builds: ['wa']
1422

0 commit comments

Comments
 (0)