Added multi-batch processing for deltaG and fixed the batch-size calculation bug (the batch size could become 0 when there were fewer items than processes)

jonas-fuchs · jonas-fuchs · commit 1bad53676209 · 2026-02-03T10:27:20.000+01:00
diff --git a/varvamp/scripts/primers.py b/varvamp/scripts/primers.py
@@ -9,7 +9,7 @@
 import functools
 
 # LIBS
-from Bio.Seq import Seq
+from Bio.Seq import MutableSeq
 from Bio import SeqIO
 import primer3 as p3
 
@@ -100,9 +100,10 @@ def is_dimer(seq1, seq2):
     check if two sequences dimerize above threshold or are overlapping at their ends
     """
     dimer_result = calc_dimer(seq1, seq2, structure=True)
-
+    # check both the temperature and the deltaG
     if dimer_result.tm > config.PRIMER_MAX_DIMER_TMP or dimer_result.dg < config.PRIMER_MAX_DIMER_DELTAG:
         return True
+    # check for perfect end overlaps (this can result in primer extensions even though the tm/dg are okay)
     if has_end_overlap(dimer_result):
         return True
 
@@ -175,7 +176,7 @@ def rev_complement(seq):
     """
     reverse complement a sequence
     """
-    return str(Seq(seq).reverse_complement())
+    return str(MutableSeq(seq).reverse_complement(inplace=True))
 
 
 def calc_permutation_penalty(amb_seq):
@@ -353,18 +354,18 @@ def _process_kmer_batch(ambiguous_consensus, alignment, kmers):
     for kmer in kmers:
         if not filter_kmer_direction_independent(kmer[0]):
             continue
-
+        # calc penalties
         base_penalty = calc_base_penalty(kmer[0], config.PRIMER_TMP, config.PRIMER_GC_RANGE, config.PRIMER_SIZES)
         per_base_mismatches = calc_per_base_mismatches(kmer, alignment, ambiguous_consensus)
         permutation_penalty = calc_permutation_penalty(ambiguous_consensus[kmer[1]:kmer[2]])
-
+        # some filters depend on the direction of each primer
         for direction in ["+", "-"]:
             if not filter_kmer_direction_dependend(direction, kmer, ambiguous_consensus):
                 continue
-
+            # calc penalties
             three_prime_penalty = calc_3_prime_penalty(direction, per_base_mismatches)
             primer_penalty = base_penalty + permutation_penalty + three_prime_penalty
-
+            # add to lists depending on their direction
             if direction == "+":
                 left_primers.append([kmer[0], kmer[1], kmer[2], primer_penalty, per_base_mismatches])
             else:
@@ -383,7 +384,7 @@ def find_primers(kmers, ambiguous_consensus, alignment, num_processes):
 
     # Convert kmers set to list for slicing
     kmers = list(kmers)
-    batch_size = int(len(kmers)/num_processes)
+    batch_size = max(1, int(len(kmers)/num_processes))
 
     # Split kmers into batches
     batches = [kmers[i:i + batch_size] for i in range(0, len(kmers), batch_size)]
@@ -416,7 +417,7 @@ def create_primer_dictionary(primer_candidates, direction):
     for primer in primer_candidates:
         if direction == "+":
             direction_name = "LEFT"
-        elif direction == "-":
+        else:
             direction_name = "RIGHT"
         primer_name = f"{direction_name}_{primer_idx}"
         primer_dict[primer_name] = primer
@@ -540,6 +541,7 @@ def filter_non_dimer_candidates(primer_candidates, external_sequences, n_process
 
     with multiprocessing.Pool(processes=n_processes) as pool:
         # Prepare arguments based on input type
+        # qpcr probes are stored in a dictionary --> .items() hands each worker a (key, value) tuple
         if is_dict:
             results = pool.map(callable_f, primer_candidates.items())
         else:
diff --git a/varvamp/scripts/qpcr.py b/varvamp/scripts/qpcr.py
@@ -99,7 +99,7 @@ def get_qpcr_probes(kmers, ambiguous_consensus, alignment_cleaned, num_processes
     kmers = list(kmers)
 
     # Split kmers into batches
-    batch_size = int(len(kmers) / num_processes)
+    batch_size = max(1, int(len(kmers) / num_processes))
     batches = [kmers[i:i + batch_size] for i in range(0, len(kmers), batch_size)]
 
     # Prepare arguments for each dimer
@@ -282,7 +282,7 @@ def find_qcr_schemes(qpcr_probes, left_primer_candidates, right_primer_candidate
     amplicon_nr = -1
 
     # Prepare arguments for parallel processing - pass full primer lists
-    batch_size = int(len(qpcr_probes) / num_processes)
+    batch_size = max(1, int(len(qpcr_probes) / num_processes))
     callable_f = functools.partial(
         find_single_qpcr_scheme,
         left_primer_candidates, right_primer_candidates, qpcr_probes, majority_consensus, ambiguous_consensus
@@ -315,7 +315,7 @@ def find_qcr_schemes(qpcr_probes, left_primer_candidates, right_primer_candidate
     return qpcr_scheme_candidates
 
 
-def process_single_amplicon_deltaG(amplicon, majority_consensus):
+def process_single_amplicon_deltaG(majority_consensus, amplicon):
     """
     Process a single amplicon to test its deltaG and apply filtering.
     This function will be called concurrently by multiple threads.
@@ -341,32 +341,33 @@ def test_amplicon_deltaG_parallel(qpcr_schemes_candidates, majority_consensus, n
     """
     final_amplicons = []
 
-    # Create a pool of processes to handle the concurrent processing
+    # Create a sorted list of all amplicon candidates (NOTE(review): no longer cut to the first n_to_test, which now only sets the chunk size - confirm)
+    # The list is sorted first on whether offset targets were predicted for the amplicon,
+    # then by penalty. This ensures that amplicons with offset targets are always considered last
+    amplicons = list(sorted(qpcr_schemes_candidates, key=lambda x: (x.get("offset_targets", False), x["penalty"])))
+    # process amplicons concurrently
+    batch_size = max(1, int(n_to_test / n_processes))
+    callable_f = functools.partial(
+        process_single_amplicon_deltaG,
+        majority_consensus
+    )
     with multiprocessing.Pool(processes=n_processes) as pool:
-        # Create a list of the first n amplicon tuples for processing
-        # The list is sorted first on whether offset targets were predicted for the amplicon,
-        # then by penalty. This ensures that amplicons with offset targets are always considered last
-        amplicons = itertools.islice(
-            sorted(qpcr_schemes_candidates, key=lambda x: (x.get("offset_targets", False), x["penalty"])),
-            n_to_test
-        )
-        # process amplicons concurrently
-        results = pool.starmap(process_single_amplicon_deltaG, [(amp, majority_consensus) for amp in amplicons])
-        # Process the results
-        retained_ranges = []
-        for amp in results:
-            # check if the amplicon overlaps with an amplicon that was previously
-            # found and had a high enough deltaG
-            if amp["deltaG"] <= deltaG_cutoff:
-                continue
-            amp_range = range(amp["LEFT"][1], amp["RIGHT"][2])
-            overlaps_retained = False
-            for r in retained_ranges:
-                if amp_range.start < r.stop and r.start < amp_range.stop:
-                    overlaps_retained = True
-                    break
-            if not overlaps_retained:
-                final_amplicons.append(amp)
-                retained_ranges.append(amp_range)
+        results = pool.map(callable_f, amplicons, chunksize=batch_size)
+    # Process the results
+    retained_ranges = []
+    for amp in results:
+        # check if the amplicon overlaps with an amplicon that was previously
+        # found and had a high enough deltaG
+        if amp["deltaG"] <= deltaG_cutoff:
+            continue
+        amp_range = range(amp["LEFT"][1], amp["RIGHT"][2])
+        overlaps_retained = False
+        for r in retained_ranges:
+            if amp_range.start < r.stop and r.start < amp_range.stop:
+                overlaps_retained = True
+                break
+        if not overlaps_retained:
+            final_amplicons.append(amp)
+            retained_ranges.append(amp_range)
 
     return final_amplicons
diff --git a/varvamp/scripts/scheme.py b/varvamp/scripts/scheme.py
@@ -299,7 +299,10 @@ def test_scheme_for_dimers(amplicon_scheme):
 
     primer_dimers = []
     non_dimers = {amp["pool"]:set() for amp in amplicon_scheme}
-    # write all primer sequences in the respective pools
+    # write all primer sequences in the respective pools -->
+    # these primers must not be disturbed by primer switching:
+    # a primer is only switched later if the replacement creates
+    # no primer dimers with the existing 'good' scheme
     for amp in amplicon_scheme:
         non_dimers[amp["pool"]].add(amp["LEFT"][0])
         non_dimers[amp["pool"]].add(amp["RIGHT"][0])
@@ -324,6 +327,7 @@ def test_scheme_for_dimers(amplicon_scheme):
                 # and remember all tested primers
                 tested_primers.append(current_primer)
 
+    # report both dimers and non-dimers
     return primer_dimers, non_dimers