Include more countries, especially some big ones with force include

corneliusroemer · corneliusroemer · commit db423bb62c07 · 2023-09-28T00:33:46.000+02:00
Relax the location_min_seq_days as many important countries have quite some delay
but are still informative/important
diff --git a/config/config.yaml b/config/config.yaml
@@ -16,18 +16,20 @@ prepare_data:
     nextstrain_clades:
       global:
         included_days: 150
-        location_min_seq: 100
-        location_min_seq_days: 30
+        location_min_seq: 200
+        location_min_seq_days: 70
         excluded_locations: "defaults/global_excluded_locations.txt"
+        included_locations: "defaults/global_included_locations.txt"
         prune_seq_days: 12
         clade_min_seq: 5000
         clade_min_seq_days: 150
     pango_lineages:
       global:
         included_days: 150
         location_min_seq: 300
-        location_min_seq_days: 30
+        location_min_seq_days: 100
         excluded_locations: "defaults/global_excluded_locations.txt"
+        included_locations: "defaults/global_included_locations.txt"
         prune_seq_days: 12
         clade_min_seq: 1
         clade_min_seq_days: 150
diff --git a/defaults/global_included_locations.txt b/defaults/global_included_locations.txt
@@ -0,0 +1,5 @@
+India
+South Africa
+Brazil
+Malaysia
+Thailand
diff --git a/scripts/prepare-data.py b/scripts/prepare-data.py
@@ -54,23 +54,25 @@ def positive_int(value):
              "This is useful to exclude sequence counts for recent days that are overly enriched for variants.")
     parser.add_argument("--location-min-seq", type=positive_int, default=1,
         help="The mininum number of sequences a location must have within the "
-             "days-min-seq to be included in analysis.\n"
+             "location-min-seq-days to be included in analysis.\n"
              "(default: %(default)s)")
     parser.add_argument("--location-min-seq-days", type=positive_int,
         help="The number of days (counting back from the cutoff date) to use as the date range "
              "for counting the number of sequences per location to determine if a location is included in analysis.\n"
              "If not provided, will count sequences from all dates included in analysis date range.")
     parser.add_argument("--excluded-locations",
-        help="File with a list locations to exclude from analysis.")
+        help="File with a list locations to always exclude from analysis.")
+    parser.add_argument("--included-locations",
+        help="File with a list locations to always include in analysis.")
     parser.add_argument("--clade-min-seq", type=positive_int,
-        help="The minimum number of sequences a clades must have to be included as it's own variant.\n"
+        help="The minimum number of sequences a clades must have to be included as its own variant.\n"
              "All clades with less than the minimum will be collapsed as 'other'.")
     parser.add_argument("--clade-min-seq-days", type=positive_int,
-        help="The number fo days (counting back from the cutoff date) to use as the date range "
+        help="The number of days (counting back from the cutoff date) to use as the date range "
              "for counting the number of sequences per clade to determine if a clade is included as its own variant.\n"
              "If not provided, will count sequences from all dates included in analysis date range.")
     parser.add_argument("--force-include-clades", nargs="*",
-        help="Clades to force include in the output regardless of sequences counts. " +
+        help="Clades to force include in the output regardless of sequence counts. " +
              "Must be formatted as <clade_name>=<variant_name>")
     parser.add_argument("--output-seq-counts", required=True,
         help="Path to output TSV file for the prepared variants data.")
@@ -131,6 +133,7 @@ def positive_int(value):
 
     # Get a set of locations that meet the location_min_seq requirement
     locations_with_min_seq = set(seqs_per_location.loc[seqs_per_location['sequences'] >= args.location_min_seq, 'location'])
+    locations_with_min_tenth_seq = set(seqs_per_location.loc[seqs_per_location['sequences'] >= args.location_min_seq / 10, 'location'])
 
     # Load manually annotated excluded locations if provided
     excluded_locations = set()
@@ -140,8 +143,16 @@ def positive_int(value):
 
         print(f"Excluding the following requested locations: {sorted(excluded_locations)}.")
 
+    # Load manually annotated excluded locations if provided
+    included_locations = set()
+    if args.included_locations:
+        with open(args.included_locations, 'r') as f:
+            included_locations = {line.rstrip() for line in f} & locations_with_min_tenth_seq
+
+        print(f"Including the following requested locations: {sorted(included_locations)}.")
+
     # Remove excluded-locations from the set of locations to include in analysis
-    locations_to_include = locations_with_min_seq - excluded_locations
+    locations_to_include = locations_with_min_seq - excluded_locations | included_locations
     print(f"Locations that will be included: {sorted(locations_to_include)}.")
 
     assert len(locations_to_include) > 0, \
diff --git a/workflow/snakemake_rules/prepare_data.smk b/workflow/snakemake_rules/prepare_data.smk
@@ -59,6 +59,7 @@ rule prepare_clade_data:
         included_days = lambda wildcards: _get_prepare_data_option(wildcards, 'included_days'),
         location_min_seq = lambda wildcards: _get_prepare_data_option(wildcards, 'location_min_seq'),
         location_min_seq_days = lambda wildcards: _get_prepare_data_option(wildcards, 'location_min_seq_days'),
+        included_locations = lambda wildcards: _get_prepare_data_option(wildcards, 'included_locations'),
         excluded_locations = lambda wildcards: _get_prepare_data_option(wildcards, 'excluded_locations'),
         prune_seq_days = lambda wildcards: _get_prepare_data_option(wildcards, 'prune_seq_days'),
         clade_min_seq = lambda wildcards: _get_prepare_data_option(wildcards, 'clade_min_seq'),
@@ -74,6 +75,7 @@ rule prepare_clade_data:
             {params.location_min_seq} \
             {params.location_min_seq_days} \
             {params.excluded_locations} \
+            {params.included_locations} \
             {params.prune_seq_days} \
             {params.clade_min_seq} \
             {params.clade_min_seq_days} \