Skip to content

Commit 714834a

Browse files
authored
Merge branch 'master' into scheme_compatibility
2 parents 7f3b28b + 9c9f99f commit 714834a

File tree

5 files changed

+34
-17
lines changed

5 files changed

+34
-17
lines changed

docs/how_varvamp_works.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,7 @@ varVAMP searches for potential primer regions as defined by a user-defined numbe
2323
varVAMP uses [`primer3-py`](https://pypi.org/project/primer3-py/) to search for potential primers. Some of the evaluation process, determining if primers match certain criteria, was adapted from [`primalscheme`](https://github.com/aresti/primalscheme). The primer search contains multiple steps:
2424
1. Digest the primer regions into kmers with the min and max length of primers. This is performed on a consensus sequence that does not contain ambiguous characters but is just the majority consensus of the alignment. Therefore, primer parameters will be later calculated for the best fitting primer.
2525
2. Evaluate if these kmers are potential primers independent of their orientation (temperature, GC, size, poly-x repeats and poly dinucleotide repeats) and dependent on their orientation (secondary structure, GC clamp, number of GCs in the last 5 bases of the 3' end and min 3' nucleotides without an ambiguous base). Filter for kmers that satisfy all constraints and calculate their penalties (explained in the last section).
26-
3. Single and tiled mode: Find primer with the lowest penalty. varVAMP sorts the primers by their penalty and always takes one with the lowest penalty if middle third of the primer has not been covered by a primer with a lower penalty. This greatly reduces the complexity of the later amplicon search while only retaining the best primer of a set of overlapping primers.
27-
26+
3. Single and tiled mode: Find primer with the lowest penalty. varVAMP sorts the primers by their penalty and always takes one with the lowest penalty if middle third of the primer has not been covered by a primer with a lower penalty. This greatly reduces the complexity of the later amplicon search while only retaining the best primer of a set of overlapping primers. If the percentage of potential primer regions exceeds 90% of the genome, varVAMP switches to a stringent mode and excludes all overlapping primers to again reduce complexity.
2827
### Amplicon search
2928

3029
#### Amplicon-tiling

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = 'varvamp'
7-
version = '1.2.3'
7+
version = '1.3'
88
description = 'Variable VirusAMPlicons (varVAMP) is a tool to design primers for highly diverse viruses.'
99
keywords = ['pcr, tiled pcr, primer-tiling, qpcr, primer design']
1010
dependencies = ["biopython>=1.79", "matplotlib>=3.5.1", "primer3-py>=1.1.0", "pandas>=1.4.4", "numpy>=1.23.3", "seqfold>=0.7.15"]

varvamp/command.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,9 @@ def shared_workflow(args, log_file):
256256
ambiguous_consensus,
257257
args.n_ambig
258258
)
259+
260+
potential_primer_regions = regions.mean(primer_regions, majority_consensus)
261+
259262
if not primer_regions:
260263
logging.raise_error(
261264
"no primer regions found. Lower the threshold!",
@@ -266,7 +269,7 @@ def shared_workflow(args, log_file):
266269
log_file,
267270
progress=0.4,
268271
job="Finding primer regions.",
269-
progress_text=f"{regions.mean(primer_regions, majority_consensus)} % of the consensus sequence will be evaluated for primers"
272+
progress_text=f"{potential_primer_regions} % of the consensus sequence will be evaluated for primers"
270273
)
271274

272275
# produce kmers for all primer regions
@@ -316,20 +319,26 @@ def shared_workflow(args, log_file):
316319
progress_text=f"{len(left_primer_candidates)} fw and {len(right_primer_candidates)} rv primers after filtering"
317320
)
318321

319-
return alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates, compatible_primers
322+
return alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates, potential_primer_regions, compatible_primers
320323

321324

322-
def single_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_candidates, data_dir, log_file):
325+
def single_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_candidates, potential_primer_regions, data_dir, log_file):
323326
"""
324327
part of the workflow shared by the single and tiled mode
325328
"""
326329

327330
# find best primers and create primer dict
328-
all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates)
331+
# depending on the percentage of potential primer regions use high conservation mode
332+
if potential_primer_regions >= 90:
333+
all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates, high_conservation=True)
334+
job_text = "Excluding overlapping primers (stringent)."
335+
else:
336+
all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates, high_conservation=False)
337+
job_text = "Excluding overlapping primers."
329338
logging.varvamp_progress(
330339
log_file,
331340
progress=0.7,
332-
job="Considering primers with low penalties.",
341+
job=f"{job_text}",
333342
progress_text=f"{len(all_primers['+'])} fw and {len(all_primers['-'])} rv primers"
334343
)
335344

@@ -544,7 +553,7 @@ def main():
544553
blast.check_BLAST_installation(log_file)
545554

546555
# mode unspecific part of the workflow
547-
alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates, compatible_primers = shared_workflow(args, log_file)
556+
alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates, potential_primer_regions, compatible_primers = shared_workflow(args, log_file)
548557

549558
# write files that are shared in all modes
550559
reporting.write_regions_to_bed(primer_regions, args.name, data_dir)
@@ -570,6 +579,7 @@ def main():
570579
args,
571580
left_primer_candidates,
572581
right_primer_candidates,
582+
potential_primer_regions,
573583
data_dir,
574584
log_file
575585
)

varvamp/scripts/param_estimation.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,11 @@ def get_parameters(preprocessed_alignment, args, log_file):
114114
# check if the distance is acceptable
115115
distance_threshold = args.opt_length - 2 * args.overlap if args.mode == 'tiled' else args.opt_length
116116
if max_distance_between_passing < distance_threshold:
117-
args.threshold += 0.01
117+
# never exceed 0.99
118+
if args.threshold < 0.99:
119+
args.threshold += 0.01
120+
else:
121+
break
118122
# or reset to the param of the two previous iterations
119123
else:
120124
args.threshold -= 0.02

varvamp/scripts/primers.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,7 @@ def create_primer_dictionary(primer_candidates, direction):
463463
return primer_dict
464464

465465

466-
def find_best_primers(left_primer_candidates, right_primer_candidates):
466+
def find_best_primers(left_primer_candidates, right_primer_candidates, high_conservation:bool=False):
467467
"""
468468
Primer candidates are likely overlapping. Here, the list of primers
469469
is sorted for the lowest to highest penalty. Then, the next lowest
@@ -489,16 +489,20 @@ def find_best_primers(left_primer_candidates, right_primer_candidates):
489489
primer_candidates.sort(key=lambda x: (x[3], x[1]))
490490
# ini everything with the primer with the lowest penalty
491491
to_retain = [primer_candidates[0]]
492-
primer_ranges = list(range(primer_candidates[0][1], primer_candidates[0][2]))
493-
primer_set = set(primer_ranges)
492+
primer_set = set(range(primer_candidates[0][1], primer_candidates[0][2]))
494493

495-
for primer in primer_candidates:
494+
for primer in primer_candidates[1:]:
495+
# for highly conserved alignments exclude everything that overlaps with the best primer
496+
# this reduces graph complexity by quite a large margin
497+
if high_conservation:
498+
primer_positions =set(range(primer[1], primer[2]))
496499
# get the thirds of the primer, only consider the middle
497-
thirds_len = int((primer[2] - primer[1])/3)
498-
primer_positions = list(range(primer[1] + thirds_len, primer[2] - thirds_len))
500+
else:
501+
thirds_len = int((primer[2] - primer[1])/3)
502+
primer_positions = set(range(primer[1] + thirds_len, primer[2] - thirds_len))
499503
# check if none of the nucleotides of the next primer
500504
# are already covered by a better primer
501-
if not any(x in primer_positions for x in primer_set):
505+
if primer_set.isdisjoint(primer_positions):
502506
# update the primer set
503507
primer_set.update(primer_positions)
504508
# append this primer as it has a low penalty and is not overlapping

0 commit comments

Comments
 (0)