Merge branch 'master' into scheme_compatibility

jonas-fuchs · web-flow · commit 714834a7caf8 · 2026-01-26T11:02:54.000+01:00
diff --git a/docs/how_varvamp_works.md b/docs/how_varvamp_works.md
@@ -23,8 +23,7 @@ varVAMP searches for potential primer regions as defined by a user-defined numbe
 varVAMP uses [`primer3-py`](https://pypi.org/project/primer3-py/) to search for potential primers. Some of the evaluation process, determining if primers match certain criteria, was adapted from [`primalscheme`](https://github.com/aresti/primalscheme). The primer search contains multiple steps:
 1. Digest the primer regions into kmers with the min and max length of primers. This is performed on a consensus sequence that does not contain ambiguous characters but is just the majority consensus of the alignment. Therefore, primer parameters will be later calculated for the best fitting primer.
 2. Evaluate if these kmers are potential primers independent of their orientation (temperature, GC, size, poly-x repeats and poly dinucleotide repeats) and dependent on their orientation (secondary structure, GC clamp, number of GCs in the last 5 bases of the 3' end and min 3' nucleotides without an ambiguous base). Filter for kmers that satisfy all constraints and calculate their penalties (explained in the last section).
-3. Single and tiled mode: Find primer with the lowest penalty. varVAMP sorts the primers by their penalty and always takes one with the lowest penalty if middle third of the primer has not been covered by a primer with a lower penalty. This greatly reduces the complexity of the later amplicon search while only retaining the best primer of a set of overlapping primers.
-
+3. Single and tiled mode: Find the primers with the lowest penalty. varVAMP sorts the primers by their penalty and always takes the one with the lowest penalty if the middle third of the primer has not been covered by a primer with a lower penalty. This greatly reduces the complexity of the later amplicon search while only retaining the best primer of a set of overlapping primers. If the potential primer regions cover 90% or more of the consensus sequence, varVAMP switches to a stringent mode and excludes all overlapping primers to further reduce complexity.
 ### Amplicon search
 
 #### Amplicon-tiling
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = 'varvamp'
-version = '1.2.3'
+version = '1.3'
 description = 'Variable VirusAMPlicons (varVAMP) is a tool to design primers for highly diverse viruses.'
 keywords = ['pcr, tiled pcr, primer-tiling, qpcr, primer design']
 dependencies = ["biopython>=1.79", "matplotlib>=3.5.1", "primer3-py>=1.1.0", "pandas>=1.4.4", "numpy>=1.23.3", "seqfold>=0.7.15"]
diff --git a/varvamp/command.py b/varvamp/command.py
@@ -256,6 +256,9 @@ def shared_workflow(args, log_file):
         ambiguous_consensus,
         args.n_ambig
     )
+
+    potential_primer_regions = regions.mean(primer_regions, majority_consensus)
+
     if not primer_regions:
         logging.raise_error(
             "no primer regions found. Lower the threshold!",
@@ -266,7 +269,7 @@ def shared_workflow(args, log_file):
         log_file,
         progress=0.4,
         job="Finding primer regions.",
-        progress_text=f"{regions.mean(primer_regions, majority_consensus)} % of the consensus sequence will be evaluated for primers"
+        progress_text=f"{potential_primer_regions} % of the consensus sequence will be evaluated for primers"
     )
 
     # produce kmers for all primer regions
@@ -316,20 +319,26 @@ def shared_workflow(args, log_file):
             progress_text=f"{len(left_primer_candidates)} fw and {len(right_primer_candidates)} rv primers after filtering"
         )
 
-    return alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates, compatible_primers
+    return alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates, potential_primer_regions, compatible_primers
 
 
-def single_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_candidates, data_dir, log_file):
+def single_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_candidates, potential_primer_regions, data_dir, log_file):
     """
     part of the workflow shared by the single and tiled mode
     """
 
     # find best primers and create primer dict
-    all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates)
+    # depending on the percentage of potential primer regions use high conservation mode
+    if potential_primer_regions >= 90:
+        all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates, high_conservation=True)
+        job_text = "Excluding overlapping primers (stringent)."
+    else:
+        all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates, high_conservation=False)
+        job_text = "Excluding overlapping primers."
     logging.varvamp_progress(
         log_file,
         progress=0.7,
-        job="Considering primers with low penalties.",
+        job=f"{job_text}",
         progress_text=f"{len(all_primers['+'])} fw and {len(all_primers['-'])} rv primers"
     )
 
@@ -544,7 +553,7 @@ def main():
         blast.check_BLAST_installation(log_file)
 
     # mode unspecific part of the workflow
-    alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates, compatible_primers = shared_workflow(args, log_file)
+    alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates, potential_primer_regions, compatible_primers = shared_workflow(args, log_file)
 
     # write files that are shared in all modes
     reporting.write_regions_to_bed(primer_regions, args.name, data_dir)
@@ -570,6 +579,7 @@ def main():
             args,
             left_primer_candidates,
             right_primer_candidates,
+            potential_primer_regions,
             data_dir,
             log_file
         )
diff --git a/varvamp/scripts/param_estimation.py b/varvamp/scripts/param_estimation.py
@@ -114,7 +114,11 @@ def get_parameters(preprocessed_alignment, args, log_file):
             # check if the distance is acceptable
             distance_threshold = args.opt_length - 2 * args.overlap if args.mode == 'tiled' else args.opt_length
             if max_distance_between_passing < distance_threshold:
-                args.threshold += 0.01
+                # never exceed 0.99
+                if args.threshold < 0.99:
+                    args.threshold += 0.01
+                else:
+                    break
             # or reset to the param of the two previous iterations
             else:
                 args.threshold -= 0.02
diff --git a/varvamp/scripts/primers.py b/varvamp/scripts/primers.py
@@ -463,7 +463,7 @@ def create_primer_dictionary(primer_candidates, direction):
     return primer_dict
 
 
-def find_best_primers(left_primer_candidates, right_primer_candidates):
+def find_best_primers(left_primer_candidates, right_primer_candidates, high_conservation:bool=False):
     """
     Primer candidates are likely overlapping. Here, the list of primers
     is sorted for the lowest to highest penalty. Then, the next lowest
@@ -489,16 +489,20 @@ def find_best_primers(left_primer_candidates, right_primer_candidates):
         primer_candidates.sort(key=lambda x: (x[3], x[1]))
         # ini everything with the primer with the lowest penalty
         to_retain = [primer_candidates[0]]
-        primer_ranges = list(range(primer_candidates[0][1], primer_candidates[0][2]))
-        primer_set = set(primer_ranges)
+        primer_set = set(range(primer_candidates[0][1], primer_candidates[0][2]))
 
-        for primer in primer_candidates:
+        for primer in primer_candidates[1:]:
+            # for highly conserved alignments exclude everything that overlaps with the best primer
+            # this reduces graph complexity by quite a large margin
+            if high_conservation:
+                primer_positions =set(range(primer[1], primer[2]))
             # get the thirds of the primer, only consider the middle
-            thirds_len = int((primer[2] - primer[1])/3)
-            primer_positions = list(range(primer[1] + thirds_len, primer[2] - thirds_len))
+            else:
+                thirds_len = int((primer[2] - primer[1])/3)
+                primer_positions = set(range(primer[1] + thirds_len, primer[2] - thirds_len))
             # check if none of the nucleotides of the next primer
             # are already covered by a better primer
-            if not any(x in primer_positions for x in primer_set):
+            if primer_set.isdisjoint(primer_positions):
                 # update the primer set
                 primer_set.update(primer_positions)
                 # append this primer as it has a low penalty and is not overlapping