Skip to content

Commit db423bb

Browse files
Include more countries, especially some big ones with force include
Relax the location_min_seq_days as many important countries have quite some delay but are still informative/important
1 parent 66b6d7f commit db423bb

File tree

4 files changed

+29
-9
lines changed

4 files changed

+29
-9
lines changed

config/config.yaml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,18 +16,20 @@ prepare_data:
1616
nextstrain_clades:
1717
global:
1818
included_days: 150
19-
location_min_seq: 100
20-
location_min_seq_days: 30
19+
location_min_seq: 200
20+
location_min_seq_days: 70
2121
excluded_locations: "defaults/global_excluded_locations.txt"
22+
included_locations: "defaults/global_included_locations.txt"
2223
prune_seq_days: 12
2324
clade_min_seq: 5000
2425
clade_min_seq_days: 150
2526
pango_lineages:
2627
global:
2728
included_days: 150
2829
location_min_seq: 300
29-
location_min_seq_days: 30
30+
location_min_seq_days: 100
3031
excluded_locations: "defaults/global_excluded_locations.txt"
32+
included_locations: "defaults/global_included_locations.txt"
3133
prune_seq_days: 12
3234
clade_min_seq: 1
3335
clade_min_seq_days: 150
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
India
2+
South Africa
3+
Brazil
4+
Malaysia
5+
Thailand

scripts/prepare-data.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,23 +54,25 @@ def positive_int(value):
5454
"This is useful to exclude sequence counts for recent days that are overly enriched for variants.")
5555
parser.add_argument("--location-min-seq", type=positive_int, default=1,
5656
help="The mininum number of sequences a location must have within the "
57-
"days-min-seq to be included in analysis.\n"
57+
"location-min-seq-days to be included in analysis.\n"
5858
"(default: %(default)s)")
5959
parser.add_argument("--location-min-seq-days", type=positive_int,
6060
help="The number of days (counting back from the cutoff date) to use as the date range "
6161
"for counting the number of sequences per location to determine if a location is included in analysis.\n"
6262
"If not provided, will count sequences from all dates included in analysis date range.")
6363
parser.add_argument("--excluded-locations",
64-
help="File with a list locations to exclude from analysis.")
64+
help="File with a list locations to always exclude from analysis.")
65+
parser.add_argument("--included-locations",
66+
help="File with a list locations to always include in analysis.")
6567
parser.add_argument("--clade-min-seq", type=positive_int,
66-
help="The minimum number of sequences a clades must have to be included as it's own variant.\n"
68+
help="The minimum number of sequences a clades must have to be included as its own variant.\n"
6769
"All clades with less than the minimum will be collapsed as 'other'.")
6870
parser.add_argument("--clade-min-seq-days", type=positive_int,
69-
help="The number fo days (counting back from the cutoff date) to use as the date range "
71+
help="The number of days (counting back from the cutoff date) to use as the date range "
7072
"for counting the number of sequences per clade to determine if a clade is included as its own variant.\n"
7173
"If not provided, will count sequences from all dates included in analysis date range.")
7274
parser.add_argument("--force-include-clades", nargs="*",
73-
help="Clades to force include in the output regardless of sequences counts. " +
75+
help="Clades to force include in the output regardless of sequence counts. " +
7476
"Must be formatted as <clade_name>=<variant_name>")
7577
parser.add_argument("--output-seq-counts", required=True,
7678
help="Path to output TSV file for the prepared variants data.")
@@ -131,6 +133,7 @@ def positive_int(value):
131133

132134
# Get a set of locations that meet the location_min_seq requirement
133135
locations_with_min_seq = set(seqs_per_location.loc[seqs_per_location['sequences'] >= args.location_min_seq, 'location'])
136+
locations_with_min_tenth_seq = set(seqs_per_location.loc[seqs_per_location['sequences'] >= args.location_min_seq / 10, 'location'])
134137

135138
# Load manually annotated excluded locations if provided
136139
excluded_locations = set()
@@ -140,8 +143,16 @@ def positive_int(value):
140143

141144
print(f"Excluding the following requested locations: {sorted(excluded_locations)}.")
142145

146+
# Load manually annotated included locations if provided
147+
included_locations = set()
148+
if args.included_locations:
149+
with open(args.included_locations, 'r') as f:
150+
included_locations = {line.rstrip() for line in f} & locations_with_min_tenth_seq
151+
152+
print(f"Including the following requested locations: {sorted(included_locations)}.")
153+
143154
# Remove excluded-locations from the set of locations to include in analysis
144-
locations_to_include = locations_with_min_seq - excluded_locations
155+
locations_to_include = locations_with_min_seq - excluded_locations | included_locations
145156
print(f"Locations that will be included: {sorted(locations_to_include)}.")
146157

147158
assert len(locations_to_include) > 0, \

workflow/snakemake_rules/prepare_data.smk

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ rule prepare_clade_data:
5959
included_days = lambda wildcards: _get_prepare_data_option(wildcards, 'included_days'),
6060
location_min_seq = lambda wildcards: _get_prepare_data_option(wildcards, 'location_min_seq'),
6161
location_min_seq_days = lambda wildcards: _get_prepare_data_option(wildcards, 'location_min_seq_days'),
62+
included_locations = lambda wildcards: _get_prepare_data_option(wildcards, 'included_locations'),
6263
excluded_locations = lambda wildcards: _get_prepare_data_option(wildcards, 'excluded_locations'),
6364
prune_seq_days = lambda wildcards: _get_prepare_data_option(wildcards, 'prune_seq_days'),
6465
clade_min_seq = lambda wildcards: _get_prepare_data_option(wildcards, 'clade_min_seq'),
@@ -74,6 +75,7 @@ rule prepare_clade_data:
7475
{params.location_min_seq} \
7576
{params.location_min_seq_days} \
7677
{params.excluded_locations} \
78+
{params.included_locations} \
7779
{params.prune_seq_days} \
7880
{params.clade_min_seq} \
7981
{params.clade_min_seq_days} \

0 commit comments

Comments
 (0)