@@ -54,23 +54,25 @@ def positive_int(value):
5454 "This is useful to exclude sequence counts for recent days that are overly enriched for variants." )
5555 parser .add_argument ("--location-min-seq" , type = positive_int , default = 1 ,
5656 help = "The mininum number of sequences a location must have within the "
57- "days -min-seq to be included in analysis.\n "
57+ "location -min-seq-days to be included in analysis.\n "
5858 "(default: %(default)s)" )
5959 parser .add_argument ("--location-min-seq-days" , type = positive_int ,
6060 help = "The number of days (counting back from the cutoff date) to use as the date range "
6161 "for counting the number of sequences per location to determine if a location is included in analysis.\n "
6262 "If not provided, will count sequences from all dates included in analysis date range." )
6363 parser .add_argument ("--excluded-locations" ,
64- help = "File with a list locations to exclude from analysis." )
64+ help = "File with a list locations to always exclude from analysis." )
65+ parser .add_argument ("--included-locations" ,
66+ help = "File with a list locations to always include in analysis." )
6567 parser .add_argument ("--clade-min-seq" , type = positive_int ,
66- help = "The minimum number of sequences a clades must have to be included as it's own variant.\n "
68+ help = "The minimum number of sequences a clades must have to be included as its own variant.\n "
6769 "All clades with less than the minimum will be collapsed as 'other'." )
6870 parser .add_argument ("--clade-min-seq-days" , type = positive_int ,
69- help = "The number fo days (counting back from the cutoff date) to use as the date range "
71+ help = "The number of days (counting back from the cutoff date) to use as the date range "
7072 "for counting the number of sequences per clade to determine if a clade is included as its own variant.\n "
7173 "If not provided, will count sequences from all dates included in analysis date range." )
7274 parser .add_argument ("--force-include-clades" , nargs = "*" ,
73- help = "Clades to force include in the output regardless of sequences counts. " +
75+ help = "Clades to force include in the output regardless of sequence counts. " +
7476 "Must be formatted as <clade_name>=<variant_name>" )
7577 parser .add_argument ("--output-seq-counts" , required = True ,
7678 help = "Path to output TSV file for the prepared variants data." )
@@ -131,6 +133,7 @@ def positive_int(value):
131133
132134 # Get a set of locations that meet the location_min_seq requirement
133135 locations_with_min_seq = set (seqs_per_location .loc [seqs_per_location ['sequences' ] >= args .location_min_seq , 'location' ])
136+ locations_with_min_tenth_seq = set (seqs_per_location .loc [seqs_per_location ['sequences' ] >= args .location_min_seq / 10 , 'location' ])
134137
135138 # Load manually annotated excluded locations if provided
136139 excluded_locations = set ()
@@ -140,8 +143,16 @@ def positive_int(value):
140143
141144 print (f"Excluding the following requested locations: { sorted (excluded_locations )} ." )
142145
146+ # Load manually annotated included locations if provided (kept only if they meet a tenth of location_min_seq)
147+ included_locations = set ()
148+ if args .included_locations :
149+ with open (args .included_locations , 'r' ) as f :
150+ included_locations = {line .rstrip () for line in f } & locations_with_min_tenth_seq
151+
152+ print (f"Including the following requested locations: { sorted (included_locations )} ." )
153+
143154 # Remove excluded-locations from the set of locations to include in analysis, then add included-locations (inclusion wins if a location is in both)
144- locations_to_include = locations_with_min_seq - excluded_locations
155+ locations_to_include = locations_with_min_seq - excluded_locations | included_locations
145156 print (f"Locations that will be included: { sorted (locations_to_include )} ." )
146157
147158 assert len (locations_to_include ) > 0 , \
0 commit comments