finalized review from wm75

jonas-fuchs · jonas-fuchs · commit 549d48b2deea · 2026-01-31T15:17:05.000+01:00
diff --git a/varvamp/command.py b/varvamp/command.py
@@ -195,6 +195,11 @@ def shared_workflow(args, log_file):
     # read in external primer sequences with which new primers should not form dimers
     if args.compatible_primers is not None:
         compatible_primers = primers.parse_primer_fasta(args.compatible_primers)
+        if not compatible_primers:
+            logging.raise_error(
+                "no valid primers found in the provided primer file.\n",
+                log_file,
+            )
     else:
         compatible_primers = None
     # check alignment length and number of gaps and report if its significantly more/less than expected
@@ -304,7 +309,7 @@ def shared_workflow(args, log_file):
     )
 
     # filter primers against user-provided list of compatible primers, can use multi-processing
-    if compatible_primers is not None:
+    if compatible_primers:
         left_primer_candidates = primers.filter_non_dimer_candidates(
             left_primer_candidates, compatible_primers, args.threads
         )
@@ -394,7 +399,7 @@ def single_workflow(args, amplicons, log_file):
     return amplicon_scheme
 
 
-def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candidates, all_primers, ambiguous_consensus, log_file, results_dir):
+def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candidates, all_primers, ambiguous_consensus, log_file):
     """
     part of the workflow specific for the tiled mode
     """
@@ -452,9 +457,8 @@ def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candida
             job="Trying to solve primer dimers.",
             progress_text=f"{len(dimers_not_solved)}/{n_initial_dimers} dimers could not be resolved"
         )
-        reporting.write_dimers(results_dir, dimers_not_solved)
 
-    return amplicon_scheme
+    return amplicon_scheme, dimers_not_solved
 
 
 def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majority_consensus, left_primer_candidates, right_primer_candidates, compatible_primers, log_file):
@@ -495,7 +499,7 @@ def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majori
     )
 
     # filter primers against non-dimer sequences if provided
-    if compatible_primers is not None:
+    if compatible_primers:
         qpcr_probes = primers.filter_non_dimer_candidates(
             qpcr_probes, compatible_primers, args.threads)
         logging.varvamp_progress(
@@ -591,6 +595,7 @@ def main():
 
     # SINGLE/TILED mode
     if args.mode == "tiled" or args.mode == "single":
+        dimers_not_solved = None
         all_primers, amplicons = single_and_tiled_shared_workflow(
             args,
             left_primer_candidates,
@@ -606,15 +611,14 @@ def main():
                 log_file
             )
         elif args.mode == "tiled":
-            amplicon_scheme = tiled_workflow(
+            amplicon_scheme, dimers_not_solved = tiled_workflow(
                 args,
                 amplicons,
                 left_primer_candidates,
                 right_primer_candidates,
                 all_primers,
                 ambiguous_consensus,
                 log_file,
-                results_dir
             )
 
         # write files
@@ -631,7 +635,8 @@ def main():
             ambiguous_consensus,
             args.name,
             args.mode,
-            log_file
+            log_file,
+            dimers_not_solved
         )
         reporting.varvamp_plot(
             results_dir,
diff --git a/varvamp/scripts/primers.py b/varvamp/scripts/primers.py
@@ -6,6 +6,7 @@
 import itertools
 import re
 import multiprocessing
+import functools
 
 # LIBS
 from Bio.Seq import Seq
@@ -341,12 +342,11 @@ def filter_kmer_direction_dependend(direction, kmer, ambiguous_consensus):
     )
 
 
-def _process_kmer_batch(args):
+def _process_kmer_batch(ambiguous_consensus, alignment, kmers):
     """
     Helper function for multiprocessing: process a batch of kmers.
     Returns (left_primers, right_primers) tuples.
     """
-    kmers, ambiguous_consensus, alignment = args
     left_primers = []
     right_primers = []
 
@@ -387,11 +387,16 @@ def find_primers(kmers, ambiguous_consensus, alignment, num_processes):
 
     # Split kmers into batches
     batches = [kmers[i:i + batch_size] for i in range(0, len(kmers), batch_size)]
-    args_list = [(batch, ambiguous_consensus, alignment) for batch in batches]
 
-    # Process batches in parallel
+    # Bind the shared arguments so that each worker only receives its kmer batch
+    callable_f = functools.partial(
+        _process_kmer_batch,
+        ambiguous_consensus, alignment
+    )
+
+    # Process batches in parallel
     with multiprocessing.Pool(processes=num_processes) as pool:
-        results = pool.map(_process_kmer_batch, args_list)
+        results = pool.map(callable_f, batches)
 
     # Aggregate results
     left_primer_candidates = []
@@ -502,13 +507,12 @@ def parse_primer_fasta(fasta_path):
     return list(set(sequences))  # deduplication
 
 
-def check_primer_against_externals(args):
+def check_primer_against_externals(external_sequences, primer):
     """
     Worker function to check a single primer against all external sequences.
     Returns the primer if it passes, None otherwise.
     Handles both list format and dict format (name, data) tuples.
     """
-    primer, external_sequences = args
 
     # Extract sequence based on input format
     if isinstance(primer, tuple):
@@ -524,21 +528,24 @@ def check_primer_against_externals(args):
     return primer
 
 
-def filter_non_dimer_candidates(primer_candidates, external_sequences, n_threads):
+def filter_non_dimer_candidates(primer_candidates, external_sequences, n_processes):
     """
     Filter out primer candidates that form dimers with external sequences.
     Uses multiprocessing to speed up checks.
     """
     is_dict = isinstance(primer_candidates, dict)
 
-    with multiprocessing.Pool(processes=n_threads) as pool:
+    callable_f = functools.partial(
+        check_primer_against_externals,
+        external_sequences
+    )
+
+    with multiprocessing.Pool(processes=n_processes) as pool:
         # Prepare arguments based on input type
         if is_dict:
-            args = [((name, data), external_sequences) for name, data in primer_candidates.items()]
+            results = pool.map(callable_f, primer_candidates.items())
         else:
-            args = [(primer, external_sequences) for primer in primer_candidates]
-
-        results = pool.map(check_primer_against_externals, args)
+            results = pool.map(callable_f, primer_candidates)
 
     # Filter and restore original format
     if is_dict:
diff --git a/varvamp/scripts/qpcr.py b/varvamp/scripts/qpcr.py
@@ -7,11 +7,11 @@
 import seqfold
 import itertools
 import multiprocessing
+import functools
 
 # varVAMP
 from varvamp.scripts import config
 from varvamp.scripts import primers
-from varvamp.scripts import reporting
 
 
 def choose_probe_direction(seq):
@@ -51,12 +51,11 @@ def filter_probe_direction_dependent(seq):
     )
 
 
-def _process_kmer_batch_probes(args):
+def _process_kmer_batch_probes(ambiguous_consensus, alignment_cleaned, kmers):
     """
     Helper function for multiprocessing: process a batch of kmers for probes.
     Returns probe_candidates dictionary.
     """
-    kmers, ambiguous_consensus, alignment_cleaned = args
     probe_candidates = {}
     probe_idx = 0
 
@@ -91,7 +90,7 @@ def _process_kmer_batch_probes(args):
     return probe_candidates
 
 
-def get_qpcr_probes(kmers, ambiguous_consensus, alignment_cleaned, num_processes, batch_size=1000):
+def get_qpcr_probes(kmers, ambiguous_consensus, alignment_cleaned, num_processes):
     """
     Find potential qPCR probes using multiprocessing.
     """
@@ -100,12 +99,16 @@ def get_qpcr_probes(kmers, ambiguous_consensus, alignment_cleaned, num_processes
     kmers = list(kmers)
 
     # Split kmers into batches
+    batch_size = int(len(kmers) / num_processes)
     batches = [kmers[i:i + batch_size] for i in range(0, len(kmers), batch_size)]
-    args_list = [(batch, ambiguous_consensus, alignment_cleaned) for batch in batches]
 
-    # Process batches in parallel
+    # Bind the shared arguments so that each worker only receives its kmer batch
+    callable_f = functools.partial(
+        _process_kmer_batch_probes,
+        ambiguous_consensus, alignment_cleaned
+    )
     with multiprocessing.Pool(processes=num_processes) as pool:
-        results = pool.map(_process_kmer_batch_probes, args_list)
+        results = pool.map(callable_f, batches)
 
     # Aggregate results and re-index probe names
     probe_candidates = {}
@@ -245,11 +248,14 @@ def assess_amplicons(left_subset, right_subset, qpcr_probes, probe, majority_con
     return primer_combinations
 
 
-def find_single_qpcr_scheme(probe_name, probe_data, left_primer_candidates, right_primer_candidates, qpcr_probes,
-                            majority_consensus, ambiguous_consensus):
+def find_single_qpcr_scheme(left_primer_candidates, right_primer_candidates, qpcr_probes,
+                            majority_consensus, ambiguous_consensus, probe):
     """
     Find a qPCR scheme for a single probe.
     """
+
+    probe_name, probe_data = probe
+
     # Generate flanking subsets within the worker process
     left_subset = flanking_primer_subset(left_primer_candidates, "+", probe_data)
     right_subset = flanking_primer_subset(right_primer_candidates, "-", probe_data)
@@ -266,7 +272,7 @@ def find_single_qpcr_scheme(probe_name, probe_data, left_primer_candidates, righ
 
 
 def find_qcr_schemes(qpcr_probes, left_primer_candidates, right_primer_candidates,
-                     majority_consensus, ambiguous_consensus, num_processes, batch_size=100):
+                     majority_consensus, ambiguous_consensus, num_processes):
     """
     Find final qPCR schemes using multiprocessing to evaluate probes in parallel.
     Probes are sorted by penalty, ensuring optimal probe selection.
@@ -276,15 +282,15 @@ def find_qcr_schemes(qpcr_probes, left_primer_candidates, right_primer_candidate
     amplicon_nr = -1
 
     # Prepare arguments for parallel processing - pass full primer lists
-    args_list = [
-        (probe_name, probe_data, left_primer_candidates, right_primer_candidates,
-         qpcr_probes, majority_consensus, ambiguous_consensus)
-        for probe_name, probe_data in qpcr_probes.items()
-    ]
+    batch_size = int(len(qpcr_probes) / num_processes)
+    callable_f = functools.partial(
+        find_single_qpcr_scheme,
+        left_primer_candidates, right_primer_candidates, qpcr_probes, majority_consensus, ambiguous_consensus
+    )
 
     # Process probes in parallel
     with multiprocessing.Pool(processes=num_processes) as pool:
-        results = pool.map(find_single_qpcr_scheme, args_list, chunksize=batch_size)
+        results = pool.map(callable_f, qpcr_probes.items(), chunksize=batch_size)
 
     # Aggregate results in original probe order (sorted by penalty)
     for probe_name, primer_combination in results:
@@ -327,7 +333,7 @@ def process_single_amplicon_deltaG(amplicon, majority_consensus):
     return amplicon
 
 
-def test_amplicon_deltaG_parallel(qpcr_schemes_candidates, majority_consensus, n_to_test, deltaG_cutoff, n_threads):
+def test_amplicon_deltaG_parallel(qpcr_schemes_candidates, majority_consensus, n_to_test, deltaG_cutoff, n_processes):
     """
     Test all amplicon deltaGs for the top n hits at the lowest primer temperature
     and filters if they fall below the cutoff. Multiple processes are used
@@ -336,7 +342,7 @@ def test_amplicon_deltaG_parallel(qpcr_schemes_candidates, majority_consensus, n
     final_amplicons = []
 
     # Create a pool of processes to handle the concurrent processing
-    with multiprocessing.Pool(processes=n_threads) as pool:
+    with multiprocessing.Pool(processes=n_processes) as pool:
         # Create a list of the first n amplicon tuples for processing
         # The list is sorted first on whether offset targets were predicted for the amplicon,
         # then by penalty. This ensures that amplicons with offset targets are always considered last
diff --git a/varvamp/scripts/reporting.py b/varvamp/scripts/reporting.py
@@ -209,7 +209,7 @@ def write_qpcr_to_files(path, final_schemes, ambiguous_consensus, scheme_name, l
                 print(f">{primer_name}\n{seq.upper()}", file=fasta)
 
 
-def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_name, mode, log_file):
+def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_name, mode, log_file, primer_dimers=None):
     """
     write all relevant bed files and a tsv file with all primer stats
     """
@@ -218,6 +218,9 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
     amplicon_bed_file = os.path.join(path, "amplicons.bed")
     tabular_file = os.path.join(path, "primer_to_amplicon_assignment.tabular")
 
+    # Map old primer names to new amplicon-based names
+    name_mapping = {}
+
     # open files to write
     with open(tsv_file, "w") as tsv, open(amplicon_bed_file, "w") as bed, open(tabular_file, "w") as tabular:
         # write header for primer tsv
@@ -233,11 +236,11 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
             if mode == "single":
                 primer_fasta_file = os.path.join(path, "primers.fasta")
             else:
-                primer_fasta_file = os.path.join(path, f"primers_pool_{pool+1}.fasta")
+                primer_fasta_file = os.path.join(path, f"primers_pool_{pool + 1}.fasta")
             with open(primer_fasta_file, "w") as primer_fasta:
                 for counter, amp in enumerate(amplicon_scheme[pool::len(pools)]):
                     # give a new amplicon name
-                    amplicon_index = counter*len(pools) + pool
+                    amplicon_index = counter * len(pools) + pool
                     amp_name = f"{scheme_name}_{amplicon_index}"
                     # get left and right primers and their names
                     amp_length = amp["RIGHT"][2] - amp["LEFT"][1]
@@ -251,7 +254,7 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
                         amplicon_has_off_target = "n.d."
                     # write amplicon bed
                     if mode == "tiled":
-                        bed_score = pool+1
+                        bed_score = pool + 1
                     elif mode == "single":
                         bed_score = round(amp["LEFT"][3] + amp["RIGHT"][3], 1)
                     amplicon_bed_records.append(
@@ -269,6 +272,10 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
                             (f"{amp_name}_LEFT", f"{amp_name}_RIGHT")
                         )
                     )
+                    # Build name mapping for dimers
+                    name_mapping[amp["LEFT"][-1]] = f"{amp_name}_LEFT"
+                    name_mapping[amp["RIGHT"][-1]] = f"{amp_name}_RIGHT"
+
                     # write primer tsv and primer bed
                     for direction, primer in [("+", amp["LEFT"]), ("-", amp["RIGHT"])]:
                         seq = ambiguous_consensus[primer[1]:primer[2]]
@@ -288,7 +295,7 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
                             amp_length,
                             primer_name,
                             primer[-1],
-                            pool+1,
+                            pool + 1,
                             primer[1] + 1,
                             primer[2],
                             seq.upper(),
@@ -306,7 +313,7 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
                             (
                                 # will need amplicon_index for sorting
                                 amplicon_index,
-                                (primer_name, primer, pool+1, direction, seq.upper())
+                                (primer_name, primer, pool + 1, direction, seq.upper())
                             )
                         )
         # write amplicon bed with amplicons sorted by start position
@@ -333,8 +340,12 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
                 *record[1]
             )
 
+    # Write dimers with renamed primers
+    if primer_dimers:
+        write_dimers(path, primer_dimers, name_mapping)
+
 
-def write_dimers(path, primer_dimers):
+def write_dimers(path, primer_dimers, name_mapping):
     """
     write dimers for which no replacement was found to file
     """
@@ -348,8 +359,8 @@ def write_dimers(path, primer_dimers):
             )
             print(
                 pool+1,
-                primer1[2][0],
-                primer2[2][0],
+                name_mapping[primer1[1]],
+                name_mapping[primer2[1]],
                 round(dimer_result.tm, 1),
                 dimer_result.dg,
                 sep="\t",
diff --git a/varvamp/scripts/scheme.py b/varvamp/scripts/scheme.py