From 82b924c55abbdba18ff230c6e877c62da9ff3493 Mon Sep 17 00:00:00 2001 From: areebamomin Date: Wed, 10 Dec 2025 16:27:44 -0500 Subject: [PATCH 01/10] continuing to implement feature, vibe coded and fixed some, still need to check tests Add recombinase assembly algorithm for attB/attP --- src/pydna/assembly2.py | 70 ++++++++ src/pydna/opencloning_models.py | 4 + src/pydna/recombinase.py | 251 +++++++++++++++++++++++++++ tests/test_module_recombinase.py | 288 +++++++++++++++++++++++++++++++ 4 files changed, 613 insertions(+) create mode 100644 src/pydna/recombinase.py create mode 100644 tests/test_module_recombinase.py diff --git a/src/pydna/assembly2.py b/src/pydna/assembly2.py index d62f8fb7..85b75005 100644 --- a/src/pydna/assembly2.py +++ b/src/pydna/assembly2.py @@ -39,6 +39,7 @@ from pydna.gateway import gateway_overlap, find_gateway_sites from pydna.cre_lox import cre_loxP_overlap from pydna.alphabet import anneal_strands +from pydna.recombinase import Recombinase from typing import TYPE_CHECKING, Callable, Literal from pydna.opencloning_models import ( @@ -52,6 +53,7 @@ GatewaySource, HomologousRecombinationSource, CreLoxRecombinationSource, + RecombinaseSource, PCRSource, SourceInput, CRISPRSource, @@ -2801,6 +2803,74 @@ def cre_lox_excision(genome: Dseqrecord) -> list[Dseqrecord]: return _recast_sources(products, CreLoxRecombinationSource) +def recombinase_excision( + genome: Dseqrecord, + site1: str, + site2: str, +) -> list[Dseqrecord]: + """Returns the products for recombinase-mediated excision. + + Parameters + ---------- + genome : Dseqrecord + Target genome sequence containing two recombinase sites. + site1 : str + First recombinase recognition site (lowercase = overlap core). + site2 : str + Second recombinase recognition site (lowercase = overlap core). + + Returns + ------- + list[Dseqrecord] + List containing excised plasmid and remaining genome sequence. + """ + rec = Recombinase(site1, site2) + products = common_function_excision_products(genome, None, rec.overlap) + return _recast_sources(products, RecombinaseSource) + + +def recombinase_integration( + genome: Dseqrecord, + inserts: list[Dseqrecord], + site1: str, + site2: str, +) -> list[Dseqrecord]: + """Returns the products resulting from recombinase-mediated integration. + + Parameters + ---------- + genome : Dseqrecord + Target genome sequence. + inserts : list[Dseqrecord] + DNA fragment(s) to insert. + site1 : str + First recombinase recognition site (lowercase = overlap core). + site2 : str + Second recombinase recognition site (lowercase = overlap core). + + Returns + ------- + list[Dseqrecord] + List of integrated DNA molecules. + + Examples + -------- + >>> from pydna.dseqrecord import Dseqrecord + >>> from pydna.assembly2 import recombinase_integration, recombinase_excision + >>> site1 = "ATGCCCTAAaaTT" + >>> site2 = "AAaaTTTTTTTCCCT" + >>> genome = Dseqrecord(f"cccccc{site1.upper()}aaaaa") + >>> insert = Dseqrecord(f"{site2.upper()}bbbbb", circular=True) + >>> products = recombinase_integration(genome, [insert], site1, site2) + >>> len(products) >= 1 + True + """ + fragments = common_handle_insertion_fragments(genome, inserts) + rec = Recombinase(site1, site2) + products = common_function_integration_products(fragments, None, rec.overlap) + return _recast_sources(products, RecombinaseSource) + + def crispr_integration( genome: Dseqrecord, inserts: list[Dseqrecord], diff --git a/src/pydna/opencloning_models.py b/src/pydna/opencloning_models.py index 1353a763..9efc13ab 100644 --- a/src/pydna/opencloning_models.py +++ b/src/pydna/opencloning_models.py @@ -524,6 +524,10 @@ class CreLoxRecombinationSource(AssemblySource): ) +class RecombinaseSource(AssemblySource): + pass + + class PCRSource(AssemblySource): TARGET_MODEL: ClassVar[Type[_PCRSource]] = _PCRSource add_primer_features: bool = Field(default=False) diff --git a/src/pydna/recombinase.py b/src/pydna/recombinase.py new file mode 100644 index 00000000..4436dee2 --- /dev/null +++ b/src/pydna/recombinase.py @@ -0,0 +1,251 @@ +# -*- coding: utf-8 -*- +""" +Generic site-specific recombinase functionality. + +This module provides tools for simulating site-specific recombination reactions +mediated by recombinases (e.g. phage integrases such as phiC31, Bxb1, etc.). + +Recombinase sites are specified as strings where the **lowercase** portion +indicates the homology/overlap core shared between the two sites. The +uppercase portion represents the flanking recognition arms. + +For example:: + + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + +The lowercase ``aa`` is the overlap that will appear in the assembled product. + +Sites may contain IUPAC degenerate bases (N, W, S, etc.) and the search +handles both linear and circular sequences. +""" + +import re +import itertools as _itertools + +from Bio.Seq import reverse_complement +from Bio.SeqFeature import SimpleLocation, SeqFeature + +from pydna.dseqrecord import Dseqrecord as _Dseqrecord +from pydna.utils import shift_location +from pydna.sequence_regex import compute_regex_site, dseqrecord_finditer +from pydna.types import SequenceOverlap + + +def _recombinase_homology_offset_and_length(site: str) -> tuple[int, int]: + """ + Return (offset, length) of the lowercase homology region inside a + recombinase recognition site string. + + The lowercase segment represents the shared homology between the two + recombinase sites. + + Parameters + ---------- + site : str + The recognition site sequence, with the homology region in lowercase. + Must contain only letters (no digits or symbols). + + Returns + ------- + offset : int + Index of the first lowercase character in the site. + length : int + Length of the contiguous lowercase homology region. + + Raises + ------ + ValueError + If the site contains invalid characters or no lowercase region. + """ + if not re.fullmatch(r"[A-Z]+[a-z]+[A-Z]+", site): + raise ValueError( + "Recombinase recognition site is not in the expected format." + "Expected format: [A-Z]+[a-z]+[A-Z]+, e.g. 'ATGCCCTAAaaTT'" + ) + match = re.search(r"[a-z]+", site) + return match.start(), len(match.group()) + + +class Recombinase: + """A site-specific recombinase defined by two recognition sites. Homology cores must match. + + Parameters + ---------- + site1 : str + The first recognition site. The homology core must be in lowercase. + site2 : str + The second recognition site. The homology core must be in lowercase. + + Examples + -------- + >>> from pydna.dseqrecord import Dseqrecord + >>> from pydna.recombinase import Recombinase + >>> rec = Recombinase("ATGCCCTAAaaTT", "AAaaTTTTTTTCCCT") + >>> seqA = Dseqrecord("aaaATGCCCTAAaaTTtt") + >>> seqB = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") + >>> rec.overlap(seqA, seqB) + [(12, 6, 2)] + """ + + def __init__(self, site1: str, site2: str): + self.site1 = site1 + self.site2 = site2 + + off1, len1 = _recombinase_homology_offset_and_length(site1) + off2, len2 = _recombinase_homology_offset_and_length(site2) + if site1[off1 : off1 + len1] != site2[off2 : off2 + len2]: + raise ValueError( + "Recombinase recognition sites do not have matching homology cores." + f"Expected {site1[off1:off1 + len1]} == {site2[off2:off2 + len2]}" + ) + self._homology_len = len1 + + rc1 = reverse_complement(site1) + rc2 = reverse_complement(site2) + off1_rev, _ = _recombinase_homology_offset_and_length(rc1) + off2_rev, _ = _recombinase_homology_offset_and_length(rc2) + + self._site1_fwd = compute_regex_site(site1) + self._site1_rev = compute_regex_site(rc1) + self._site2_fwd = compute_regex_site(site2) + self._site2_rev = compute_regex_site(rc2) + + self._configs = [ + (self._site1_fwd, self._site2_fwd, off1, off2), + (self._site1_rev, self._site2_rev, off1_rev, off2_rev), + ] + + def overlap( + self, + seqx: _Dseqrecord, + seqy: _Dseqrecord, + limit: None = None, + ) -> list[SequenceOverlap]: + """Find overlaps between *seqx* and *seqy* mediated by the sites. + + Parameters + ---------- + seqx : Dseqrecord + The first sequence. + seqy : Dseqrecord + The second sequence. + limit : None + Not used. Present only to satisfy the + ``AssemblyAlgorithmType`` convention. + + Returns + ------- + list[SequenceOverlap] + A list of ``(start_in_x, start_in_y, length)`` tuples. + """ + matches: list[SequenceOverlap] = [] + + for site_x_re, site_y_re, off_x, off_y in self._configs: + matches_x = list(dseqrecord_finditer(site_x_re, seqx)) + if not matches_x: + continue + matches_y = list(dseqrecord_finditer(site_y_re, seqy)) + if not matches_y: + continue + + for mx, my in _itertools.product(matches_x, matches_y): + # Here we wrap in case the sequence is circular and the match spans the origin, + # and the offset makes the match go beyond the origin. + matches.append( + ( + (mx.start() + off_x) % len(seqx), + (my.start() + off_y) % len(seqy), + self._homology_len, + ) + ) + + # Deduplicate while preserving order + seen: set[SequenceOverlap] = set() + unique: list[SequenceOverlap] = [] + for m in matches: + if m not in seen: + seen.add(m) + unique.append(m) + return unique + + +def find_recombinase_sites( + seq: _Dseqrecord, + site1: str, + site2: str, + site1_name: str = "site1", + site2_name: str = "site2", +) -> dict[str, list[SimpleLocation]]: + """Find all occurrences of the recombinase sites in *seq*. + + Parameters + ---------- + seq : Dseqrecord + Sequence to search. + site1 : str + First recognition site (may contain degenerate bases). + site2 : str + Second recognition site (may contain degenerate bases). + site1_name : str + Label for site1 in the returned dictionary. + site2_name : str + Label for site2 in the returned dictionary. + + Returns + ------- + dict[str, list[SimpleLocation]] + Dictionary mapping site names to lists of locations. + """ + out: dict[str, list[SimpleLocation]] = {} + for name, raw_site in [(site1_name, site1), (site2_name, site2)]: + fwd_re = compute_regex_site(raw_site) + rev_re = compute_regex_site(reverse_complement(raw_site)) + for pattern, strand in [(fwd_re, 1), (rev_re, -1)]: + for match in dseqrecord_finditer(pattern, seq): + loc = SimpleLocation(match.start(), match.end(), strand) + loc = shift_location(loc, 0, len(seq)) + out.setdefault(name, []).append(loc) + return out + + +def annotate_recombinase_sites( + seq: _Dseqrecord, + site1: str, + site2: str, + site1_name: str = "site1", + site2_name: str = "site2", +) -> _Dseqrecord: + """Annotate *seq* with features for all occurrences of the recombinase sites. + + Parameters + ---------- + seq : Dseqrecord + Sequence to annotate (modified in place and returned). + site1 : str + First recognition site. + site2 : str + Second recognition site. + site1_name : str + Label for site1 features. + site2_name : str + Label for site2 features. + + Returns + ------- + Dseqrecord + The same sequence with added features. + """ + sites = find_recombinase_sites(seq, site1, site2, site1_name, site2_name) + for name, locs in sites.items(): + for loc in locs: + if not any( + f.location == loc + and f.type == "protein_bind" + and f.qualifiers.get("label", []) == [name] + for f in seq.features + ): + seq.features.append( + SeqFeature(loc, type="protein_bind", qualifiers={"label": [name]}) + ) + return seq diff --git a/tests/test_module_recombinase.py b/tests/test_module_recombinase.py new file mode 100644 index 00000000..f1c462c6 --- /dev/null +++ b/tests/test_module_recombinase.py @@ -0,0 +1,288 @@ +import pytest + +from pydna.dseqrecord import Dseqrecord +from pydna.assembly2 import ( + Assembly, + recombinase_integration, + recombinase_excision, +) +from pydna.recombinase import ( + _recombinase_homology_offset_and_length, + Recombinase, + find_recombinase_sites, + annotate_recombinase_sites, +) + + +# --------------------------------------------------------------------------- +# _recombinase_homology_offset_and_length +# --------------------------------------------------------------------------- + + +def test_recombinase_homology_offset_and_length_basic(): + site1 = "ATGCCCTAAaaTT" # lowercase 'aa' at positions 9-10 + site2 = "AAaaTTTTTTTCCCT" # lowercase 'aa' at positions 2-3 + + off1, len1 = _recombinase_homology_offset_and_length(site1) + off2, len2 = _recombinase_homology_offset_and_length(site2) + + assert (off1, len1) == (9, 2) + assert (off2, len2) == (2, 2) + + +def test_recombinase_homology_offset_and_length_wrong_pattern_raises(): + for pattern in ["ATGCCCTAAATT", "ATGaa", "aaATG", "ATGaaCCaa", "ATGaaCCaaTT"]: + with pytest.raises(ValueError): + _recombinase_homology_offset_and_length(pattern) + + +def test_recombinase_homology_offset_and_length_invalid_chars(): + with pytest.raises(ValueError): + _recombinase_homology_offset_and_length("ATGaa123") + with pytest.raises(ValueError): + _recombinase_homology_offset_and_length("ATGaa-CC") + + +# --------------------------------------------------------------------------- +# Recombinase.overlap – basic (linear, exact) +# --------------------------------------------------------------------------- + + +def test_recombinase_overlap_single_match(): + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + + seqA = Dseqrecord("aaaATGCCCTAAaaTTtt") + seqB = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") + + rec = Recombinase(site1, site2) + matches = rec.overlap(seqA, seqB, limit=None) + assert matches == [(12, 6, 2)] + + +def test_recombinase_overlap_no_site_returns_empty(): + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + + seqA = Dseqrecord("aaaATGCCCTAAaaTTtt") + seqB = Dseqrecord("tataAAccTTTTTTTCCCTaaa") + + rec = Recombinase(site1, site2) + matches = rec.overlap(seqA, seqB, limit=None) + assert matches == [] + + +def test_recombinase_algorithm_with_assembly(): + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + + seqA = Dseqrecord("aaaATGCCCTAAaaTTtt") + seqB = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") + + rec = Recombinase(site1, site2) + asm = Assembly([seqA, seqB], algorithm=rec.overlap, use_fragment_order=False) + products = asm.assemble_linear() + + assert len(products) == 2 + assert str(products[0].seq) == "aaaATGCCCTAAaaTTTTTTTCCCTaaa" + assert str(products[1].seq) == "tataAAaaTTtt" + + +# --------------------------------------------------------------------------- +# Circular sequences +# --------------------------------------------------------------------------- + + +def test_circular_site_spanning_origin(): + """Sites that span the origin of a circular sequence should be found.""" + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + + circ_seq = Dseqrecord("aaTTgggggATGCCCTAA", circular=True) + linear_seq = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") + + rec = Recombinase(site1, site2) + for shift in range(len(circ_seq)): + circ_seq_shifted = circ_seq.shifted(shift) + matches = rec.overlap(circ_seq_shifted, linear_seq, limit=None) + assert len(matches) == 1 + start, _, length = matches[0] + assert ( + str(circ_seq_shifted.seq[start : (start + length) % len(circ_seq_shifted)]) + == "aa" + ) + + +def test_circular_both_sequences(): + """Both sequences circular, sites should still be found.""" + site1 = "GGGaaaCCC" + site2 = "TTaaaTT" + + circ_x = Dseqrecord("aaaCCCcaGGG", circular=True) + circ_y = Dseqrecord("aaaTTgaTT", circular=True) + + rec = Recombinase(site1, site2) + matches = rec.overlap(circ_x, circ_y, limit=None) + assert len(matches) == 1 + assert matches[0] == (0, 0, 3) + + +# --------------------------------------------------------------------------- +# Degenerate sequences +# --------------------------------------------------------------------------- + + +def test_degenerate_site(): + """Sites with IUPAC ambiguity codes should match concrete sequences.""" + site1 = "ATGNNNaaTT" + site2 = "CCNNaaTTGG" + + seqA = Dseqrecord("xxxATGCCCaaTTyyy") + seqB = Dseqrecord("xxxCCGGaaTTGGyyy") + + rec = Recombinase(site1, site2) + matches = rec.overlap(seqA, seqB, limit=None) + assert len(matches) == 1 + assert matches[0] == (9, 7, 2) + + +# --------------------------------------------------------------------------- +# Reverse complement +# --------------------------------------------------------------------------- + +# TODO: check tests below + + +def test_reverse_complement_sites(): + """Sites on the reverse strand should be detected.""" + from Bio.Seq import reverse_complement + + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + + rc_site1 = reverse_complement(site1) + seqA = Dseqrecord(f"ggg{rc_site1}ggg") + seqB = Dseqrecord(f"ttt{reverse_complement(site2)}ttt") + + rec = Recombinase(site1, site2) + matches = rec.overlap(seqA, seqB, limit=None) + assert len(matches) >= 1 + + +# --------------------------------------------------------------------------- +# Recombinase class +# --------------------------------------------------------------------------- + + +def test_recombinase_class_basic(): + rec = Recombinase("ATGCCCTAAaaTT", "AAaaTTTTTTTCCCT") + + seqA = Dseqrecord("aaaATGCCCTAAaaTTtt") + seqB = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") + + matches = rec.overlap(seqA, seqB) + assert matches == [(12, 6, 2)] + + +# --------------------------------------------------------------------------- +# Site annotation +# --------------------------------------------------------------------------- + + +def test_find_recombinase_sites(): + site1 = "ATGCCCTAAaaTT" + seq = Dseqrecord(f"ggg{site1.upper()}ttt") + sites = find_recombinase_sites( + seq, site1, "GGGaaaCCC", site1_name="s1", site2_name="s2" + ) + assert "s1" in sites + assert len(sites["s1"]) >= 1 + + +def test_annotate_recombinase_sites(): + site1 = "ATGCCCTAAaaTT" + seq = Dseqrecord(f"ggg{site1.upper()}ttt") + assert len(seq.features) == 0 + annotate_recombinase_sites( + seq, site1, "GGGaaaCCC", site1_name="mysite1", site2_name="mysite2" + ) + assert len(seq.features) >= 1 + assert any(f.qualifiers.get("label", []) == ["mysite1"] for f in seq.features) + + # Calling again should not duplicate + n_before = len(seq.features) + annotate_recombinase_sites( + seq, site1, "GGGaaaCCC", site1_name="mysite1", site2_name="mysite2" + ) + assert len(seq.features) == n_before + + +# --------------------------------------------------------------------------- +# Symmetry: site1 in seqy and site2 in seqx should also work +# --------------------------------------------------------------------------- + + +def test_sites_swapped_between_sequences(): + """With site1 in seqx and site2 in seqy (can be either order in Assembly).""" + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + + seqA = Dseqrecord("aaaATGCCCTAAaaTTtt") + seqB = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") + + rec = Recombinase(site1, site2) + matches = rec.overlap(seqA, seqB, limit=None) + assert len(matches) >= 1 + + +# --------------------------------------------------------------------------- +# recombinase_integration and recombinase_excision +# --------------------------------------------------------------------------- + + +def test_recombinase_integration(): + """Integration of insert into genome via recombinase sites.""" + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + + genome = Dseqrecord(f"cccccc{site1.upper()}aaaaa") + insert = Dseqrecord(f"{site2.upper()}bbbbb", circular=True) + + products = recombinase_integration(genome, [insert], site1, site2) + assert len(products) >= 1 + # Integrated product should contain both flanking regions and insert + seq_str = str(products[0].seq).upper() + assert "ATGCCCTAA" in seq_str + assert "TTTTTTTCCCT" in seq_str + + +def test_recombinase_excision(): + """Excision of plasmid from genome with two recombinase sites.""" + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + + genome = Dseqrecord( + f"cccccc{site1.upper()}xxxxx{site2.upper()}aaaaa", + circular=True, + ) + + products = recombinase_excision(genome, site1, site2) + assert len(products) >= 1 + + +def test_recombinase_integration_excision_reversibility(): + """With same site on both sides (Cre-Lox style), integrate then excise returns originals.""" + site = "ATGCCCTAAaaTT" + + genome = Dseqrecord(f"cccccc{site.upper()}aaaaa") + insert = Dseqrecord(f"{site.upper()}bbbbb", circular=True) + + products = recombinase_integration(genome, [insert], site, site) + assert len(products) == 1 + integrated = products[0] + + excised = recombinase_excision(integrated, site, site) + assert len(excised) == 2 + + seqs = [str(p.seq).upper() for p in excised] + assert any(site.upper() in s for s in seqs) From 850559ad0cfc5ee50c5dad7e97a9b236351aa839 Mon Sep 17 00:00:00 2001 From: Manuel Lera-Ramirez Date: Wed, 25 Feb 2026 01:15:21 +0000 Subject: [PATCH 02/10] some more fixing, but excision not giving all products --- dummy.py | 25 +++++ src/pydna/assembly2.py | 34 +++---- src/pydna/recombinase.py | 155 +++++++++++++++---------------- tests/test_module_recombinase.py | 109 +++++++++------------- 4 files changed, 160 insertions(+), 163 deletions(-) create mode 100644 dummy.py diff --git a/dummy.py b/dummy.py new file mode 100644 index 00000000..c770d819 --- /dev/null +++ b/dummy.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +from pydna.dseqrecord import Dseqrecord +from pydna.assembly2 import recombinase_excision +from pydna.recombinase import Recombinase +from pydna.assembly2 import SingleFragmentAssembly + + +site1 = "ATGCCCTAAaaCT" +site2 = "AAaaTTTTTTTCCCT" + +rec = Recombinase(site1, site2) +genome = Dseqrecord(f"cccccc{site1.upper()}tttt{site2.upper()}aaaaa") + +products = recombinase_excision(genome, rec) + +asm = SingleFragmentAssembly([genome], None, rec.overlap) +print(*asm.G.edges(data=True), sep="\n") +print(asm.get_circular_assemblies()) +print() + +seq2 = Dseqrecord(f"ag{site1.upper()}gtca{site1.upper()}aa") + +asm = SingleFragmentAssembly([seq2], limit=13) +print(*asm.G.edges(data=True), sep="\n") +# print(asm.get_circular_assemblies()) diff --git a/src/pydna/assembly2.py b/src/pydna/assembly2.py index 85b75005..2c874dc3 100644 --- a/src/pydna/assembly2.py +++ b/src/pydna/assembly2.py @@ -1970,11 +1970,15 @@ def __init__(self, frags: [Dseqrecord], limit=25, algorithm=common_sub_strings): self.G.add_node(1, seq=frag) matches = algorithm(frag, frag, limit) + # Remove matches where the whole sequence matches + matches = [match for match in matches if match[2] != len(frag)] + for match in matches: self.add_edges_from_match(match, 1, 1, frag, frag) # To avoid duplicated outputs - self.G.remove_edges_from([(-1, -1)]) + while (-1, -1) in self.G.edges(): + self.G.remove_edges_from([(-1, -1)]) # These two are constrained self.use_fragment_order = True @@ -2805,8 +2809,7 @@ def cre_lox_excision(genome: Dseqrecord) -> list[Dseqrecord]: def recombinase_excision( genome: Dseqrecord, - site1: str, - site2: str, + recombinase: Recombinase, ) -> list[Dseqrecord]: """Returns the products for recombinase-mediated excision. @@ -2814,26 +2817,23 @@ def recombinase_excision( ---------- genome : Dseqrecord Target genome sequence containing two recombinase sites. - site1 : str - First recombinase recognition site (lowercase = overlap core). - site2 : str - Second recombinase recognition site (lowercase = overlap core). + recombinase : Recombinase + Recombinase object. Returns ------- list[Dseqrecord] List containing excised plasmid and remaining genome sequence. """ - rec = Recombinase(site1, site2) - products = common_function_excision_products(genome, None, rec.overlap) + products = common_function_excision_products(genome, None, recombinase.overlap) + products = [recombinase.annotate(p) for p in products] return _recast_sources(products, RecombinaseSource) def recombinase_integration( genome: Dseqrecord, inserts: list[Dseqrecord], - site1: str, - site2: str, + recombinase: Recombinase, ) -> list[Dseqrecord]: """Returns the products resulting from recombinase-mediated integration. @@ -2843,10 +2843,8 @@ def recombinase_integration( Target genome sequence. inserts : list[Dseqrecord] DNA fragment(s) to insert. - site1 : str - First recombinase recognition site (lowercase = overlap core). - site2 : str - Second recombinase recognition site (lowercase = overlap core). + recombinase : Recombinase + Recombinase object. Returns ------- @@ -2866,8 +2864,10 @@ def recombinase_integration( True """ fragments = common_handle_insertion_fragments(genome, inserts) - rec = Recombinase(site1, site2) - products = common_function_integration_products(fragments, None, rec.overlap) + products = common_function_integration_products( + fragments, None, recombinase.overlap + ) + products = [recombinase.annotate(p) for p in products] return _recast_sources(products, RecombinaseSource) diff --git a/src/pydna/recombinase.py b/src/pydna/recombinase.py index 4436dee2..3f2669eb 100644 --- a/src/pydna/recombinase.py +++ b/src/pydna/recombinase.py @@ -21,12 +21,13 @@ """ import re -import itertools as _itertools +import itertools +import copy from Bio.Seq import reverse_complement from Bio.SeqFeature import SimpleLocation, SeqFeature -from pydna.dseqrecord import Dseqrecord as _Dseqrecord +from pydna.dseqrecord import Dseqrecord from pydna.utils import shift_location from pydna.sequence_regex import compute_regex_site, dseqrecord_finditer from pydna.types import SequenceOverlap @@ -76,6 +77,10 @@ class Recombinase: The first recognition site. The homology core must be in lowercase. site2 : str The second recognition site. The homology core must be in lowercase. + site1_name : str, optional + Label for site1 in find/annotate output. Default is "site1". + site2_name : str, optional + Label for site2 in find/annotate output. Default is "site2". Examples -------- @@ -86,11 +91,21 @@ class Recombinase: >>> seqB = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") >>> rec.overlap(seqA, seqB) [(12, 6, 2)] + >>> sites = rec.find(seqA) + >>> rec.annotate(seqA) """ - def __init__(self, site1: str, site2: str): + def __init__( + self, + site1: str, + site2: str, + site1_name: str = "site1", + site2_name: str = "site2", + ): self.site1 = site1 self.site2 = site2 + self.site1_name = site1_name + self.site2_name = site2_name off1, len1 = _recombinase_homology_offset_and_length(site1) off2, len2 = _recombinase_homology_offset_and_length(site2) @@ -118,8 +133,8 @@ def __init__(self, site1: str, site2: str): def overlap( self, - seqx: _Dseqrecord, - seqy: _Dseqrecord, + seqx: Dseqrecord, + seqy: Dseqrecord, limit: None = None, ) -> list[SequenceOverlap]: """Find overlaps between *seqx* and *seqy* mediated by the sites. @@ -149,7 +164,7 @@ def overlap( if not matches_y: continue - for mx, my in _itertools.product(matches_x, matches_y): + for mx, my in itertools.product(matches_x, matches_y): # Here we wrap in case the sequence is circular and the match spans the origin, # and the offset makes the match go beyond the origin. matches.append( @@ -169,83 +184,59 @@ def overlap( unique.append(m) return unique + def find(self, seq: Dseqrecord) -> dict[str, list[SimpleLocation]]: + """Find all occurrences of the recombinase sites in *seq*. -def find_recombinase_sites( - seq: _Dseqrecord, - site1: str, - site2: str, - site1_name: str = "site1", - site2_name: str = "site2", -) -> dict[str, list[SimpleLocation]]: - """Find all occurrences of the recombinase sites in *seq*. - - Parameters - ---------- - seq : Dseqrecord - Sequence to search. - site1 : str - First recognition site (may contain degenerate bases). - site2 : str - Second recognition site (may contain degenerate bases). - site1_name : str - Label for site1 in the returned dictionary. - site2_name : str - Label for site2 in the returned dictionary. + Parameters + ---------- + seq : Dseqrecord + Sequence to search. - Returns - ------- - dict[str, list[SimpleLocation]] - Dictionary mapping site names to lists of locations. - """ - out: dict[str, list[SimpleLocation]] = {} - for name, raw_site in [(site1_name, site1), (site2_name, site2)]: - fwd_re = compute_regex_site(raw_site) - rev_re = compute_regex_site(reverse_complement(raw_site)) - for pattern, strand in [(fwd_re, 1), (rev_re, -1)]: - for match in dseqrecord_finditer(pattern, seq): - loc = SimpleLocation(match.start(), match.end(), strand) - loc = shift_location(loc, 0, len(seq)) - out.setdefault(name, []).append(loc) - return out - - -def annotate_recombinase_sites( - seq: _Dseqrecord, - site1: str, - site2: str, - site1_name: str = "site1", - site2_name: str = "site2", -) -> _Dseqrecord: - """Annotate *seq* with features for all occurrences of the recombinase sites. + Returns + ------- + dict[str, list[SimpleLocation]] + Dictionary mapping site names to lists of locations. + """ + out: dict[str, list[SimpleLocation]] = {} + for name, raw_site in [ + (self.site1_name, self.site1), + (self.site2_name, self.site2), + ]: + fwd_re = compute_regex_site(raw_site) + rev_re = compute_regex_site(reverse_complement(raw_site)) + for pattern, strand in [(fwd_re, 1), (rev_re, -1)]: + for match in dseqrecord_finditer(pattern, seq): + loc = SimpleLocation(match.start(), match.end(), strand) + loc = shift_location(loc, 0, len(seq)) + out.setdefault(name, []).append(loc) + return out + + def annotate(self, seq: Dseqrecord) -> Dseqrecord: + """Annotate *seq* with features for all occurrences of the recombinase sites. - Parameters - ---------- - seq : Dseqrecord - Sequence to annotate (modified in place and returned). - site1 : str - First recognition site. - site2 : str - Second recognition site. - site1_name : str - Label for site1 features. - site2_name : str - Label for site2 features. + Parameters + ---------- + seq : Dseqrecord + Sequence to annotate (modified in place and returned). - Returns - ------- - Dseqrecord - The same sequence with added features. - """ - sites = find_recombinase_sites(seq, site1, site2, site1_name, site2_name) - for name, locs in sites.items(): - for loc in locs: - if not any( - f.location == loc - and f.type == "protein_bind" - and f.qualifiers.get("label", []) == [name] - for f in seq.features - ): - seq.features.append( - SeqFeature(loc, type="protein_bind", qualifiers={"label": [name]}) - ) - return seq + Returns + ------- + Dseqrecord + The same sequence with added features. + """ + out_seq = copy.deepcopy(seq) + sites = self.find(out_seq) + for name, locs in sites.items(): + for loc in locs: + if not any( + f.location == loc + and f.type == "protein_bind" + and f.qualifiers.get("label", []) == [name] + for f in out_seq.features + ): + out_seq.features.append( + SeqFeature( + loc, type="protein_bind", qualifiers={"label": [name]} + ) + ) + return out_seq diff --git a/tests/test_module_recombinase.py b/tests/test_module_recombinase.py index f1c462c6..b0aaf236 100644 --- a/tests/test_module_recombinase.py +++ b/tests/test_module_recombinase.py @@ -1,5 +1,6 @@ +# -*- coding: utf-8 -*- import pytest - +from Bio.SeqFeature import SimpleLocation from pydna.dseqrecord import Dseqrecord from pydna.assembly2 import ( Assembly, @@ -9,8 +10,6 @@ from pydna.recombinase import ( _recombinase_homology_offset_and_length, Recombinase, - find_recombinase_sites, - annotate_recombinase_sites, ) @@ -161,27 +160,14 @@ def test_reverse_complement_sites(): site2 = "AAaaTTTTTTTCCCT" rc_site1 = reverse_complement(site1) + rc_site2 = reverse_complement(site2) seqA = Dseqrecord(f"ggg{rc_site1}ggg") - seqB = Dseqrecord(f"ttt{reverse_complement(site2)}ttt") + seqB = Dseqrecord(f"ttt{rc_site2}ttt") rec = Recombinase(site1, site2) matches = rec.overlap(seqA, seqB, limit=None) - assert len(matches) >= 1 - - -# --------------------------------------------------------------------------- -# Recombinase class -# --------------------------------------------------------------------------- - - -def test_recombinase_class_basic(): - rec = Recombinase("ATGCCCTAAaaTT", "AAaaTTTTTTTCCCT") - - seqA = Dseqrecord("aaaATGCCCTAAaaTTtt") - seqB = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") - - matches = rec.overlap(seqA, seqB) - assert matches == [(12, 6, 2)] + assert len(matches) == 1 + assert matches[0] == (5, 14, 2) # --------------------------------------------------------------------------- @@ -191,48 +177,40 @@ def test_recombinase_class_basic(): def test_find_recombinase_sites(): site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + rec = Recombinase(site1, site2, site1_name="s1", site2_name="s2") seq = Dseqrecord(f"ggg{site1.upper()}ttt") - sites = find_recombinase_sites( - seq, site1, "GGGaaaCCC", site1_name="s1", site2_name="s2" - ) + sites = rec.find(seq) assert "s1" in sites - assert len(sites["s1"]) >= 1 + assert sites["s1"] == [SimpleLocation(3, 3 + len(site1), 1)] + # Works origin-spanning sites + seq_looped = seq.looped() + for shift in range(len(seq_looped)): + seq_looped_shifted = seq_looped.shifted(shift) + sites = rec.find(seq_looped_shifted) + assert str(sites["s1"][0].extract(seq_looped_shifted.seq)) == site1.upper() -def test_annotate_recombinase_sites(): - site1 = "ATGCCCTAAaaTT" - seq = Dseqrecord(f"ggg{site1.upper()}ttt") - assert len(seq.features) == 0 - annotate_recombinase_sites( - seq, site1, "GGGaaaCCC", site1_name="mysite1", site2_name="mysite2" - ) - assert len(seq.features) >= 1 - assert any(f.qualifiers.get("label", []) == ["mysite1"] for f in seq.features) + # Works with multiple sites + seq = Dseqrecord(f"ggg{site1.upper()}ttt{site2.upper()}ttt{site1.upper()}ttt") + sites = rec.find(seq) + assert len(sites["s1"]) == 2 + assert len(sites["s2"]) == 1 - # Calling again should not duplicate - n_before = len(seq.features) - annotate_recombinase_sites( - seq, site1, "GGGaaaCCC", site1_name="mysite1", site2_name="mysite2" - ) - assert len(seq.features) == n_before - -# --------------------------------------------------------------------------- -# Symmetry: site1 in seqy and site2 in seqx should also work -# --------------------------------------------------------------------------- - - -def test_sites_swapped_between_sequences(): - """With site1 in seqx and site2 in seqy (can be either order in Assembly).""" +def test_annotate_recombinase_sites(): site1 = "ATGCCCTAAaaTT" site2 = "AAaaTTTTTTTCCCT" + rec = Recombinase(site1, site2, site1_name="mysite1", site2_name="mysite2") + seq = Dseqrecord(f"ggg{site1.upper()}ttt") - seqA = Dseqrecord("aaaATGCCCTAAaaTTtt") - seqB = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") + annotated_seq = rec.annotate(seq) + assert len(annotated_seq.features) == 1 + assert annotated_seq.features[0].qualifiers.get("label", []) == ["mysite1"] + assert annotated_seq.features[0].location == SimpleLocation(3, 3 + len(site1), 1) - rec = Recombinase(site1, site2) - matches = rec.overlap(seqA, seqB, limit=None) - assert len(matches) >= 1 + # Does not change original sequence + assert len(seq.features) == 0 # --------------------------------------------------------------------------- @@ -242,32 +220,35 @@ def test_sites_swapped_between_sequences(): def test_recombinase_integration(): """Integration of insert into genome via recombinase sites.""" - site1 = "ATGCCCTAAaaTT" - site2 = "AAaaTTTTTTTCCCT" + site1 = "ATGCCCTAAaaCT" + site2 = "CAaaTTTTTTTCCCT" genome = Dseqrecord(f"cccccc{site1.upper()}aaaaa") insert = Dseqrecord(f"{site2.upper()}bbbbb", circular=True) + rec = Recombinase(site1, site2) - products = recombinase_integration(genome, [insert], site1, site2) - assert len(products) >= 1 - # Integrated product should contain both flanking regions and insert - seq_str = str(products[0].seq).upper() - assert "ATGCCCTAA" in seq_str - assert "TTTTTTTCCCT" in seq_str + products = recombinase_integration(genome, [insert], rec) + + assert len(products) == 1 + assert len(products[0].features) == 0 + assert str(products[0].seq) == "ccccccATGCCCTAAAATTTTTTTCCCTbbbbbCAAACTaaaaa" def test_recombinase_excision(): """Excision of plasmid from genome with two recombinase sites.""" - site1 = "ATGCCCTAAaaTT" + site1 = "ATGCCCTAAaaCT" site2 = "AAaaTTTTTTTCCCT" + rec = Recombinase(site1, site2) genome = Dseqrecord( - f"cccccc{site1.upper()}xxxxx{site2.upper()}aaaaa", + f"cccccc{site1.upper()}tttt{site2.upper()}aaaaa", circular=True, ) - products = recombinase_excision(genome, site1, site2) - assert len(products) >= 1 + products = recombinase_excision(genome, rec) + assert len(products) == 2 + assert str(products[0].seq) == "ccccccATGCCCTAAaaTTTTTTTCCCTtttt" + # assert str(products[1].seq) == "ccccccATGCCCTAAAATTTTTTTCCCTbbbbbCAAACTaaaaa" def test_recombinase_integration_excision_reversibility(): From f14093d37fa1c3ecf403ada0a296914543344ec4 Mon Sep 17 00:00:00 2001 From: Manuel Lera-Ramirez Date: Wed, 25 Feb 2026 12:12:01 +0000 Subject: [PATCH 03/10] tests passing --- dummy.py | 25 ------------------------- src/pydna/recombinase.py | 2 ++ tests/test_module_recombinase.py | 24 +++++++++++++----------- 3 files changed, 15 insertions(+), 36 deletions(-) delete mode 100644 dummy.py diff --git a/dummy.py b/dummy.py deleted file mode 100644 index c770d819..00000000 --- a/dummy.py +++ /dev/null @@ -1,25 +0,0 @@ -# -*- coding: utf-8 -*- -from pydna.dseqrecord import Dseqrecord -from pydna.assembly2 import recombinase_excision -from pydna.recombinase import Recombinase -from pydna.assembly2 import SingleFragmentAssembly - - -site1 = "ATGCCCTAAaaCT" -site2 = "AAaaTTTTTTTCCCT" - -rec = Recombinase(site1, site2) -genome = Dseqrecord(f"cccccc{site1.upper()}tttt{site2.upper()}aaaaa") - -products = recombinase_excision(genome, rec) - -asm = SingleFragmentAssembly([genome], None, rec.overlap) -print(*asm.G.edges(data=True), sep="\n") -print(asm.get_circular_assemblies()) -print() - -seq2 = Dseqrecord(f"ag{site1.upper()}gtca{site1.upper()}aa") - -asm = SingleFragmentAssembly([seq2], limit=13) -print(*asm.G.edges(data=True), sep="\n") -# print(asm.get_circular_assemblies()) diff --git a/src/pydna/recombinase.py b/src/pydna/recombinase.py index 3f2669eb..87e65171 100644 --- a/src/pydna/recombinase.py +++ b/src/pydna/recombinase.py @@ -129,6 +129,8 @@ def __init__( self._configs = [ (self._site1_fwd, self._site2_fwd, off1, off2), (self._site1_rev, self._site2_rev, off1_rev, off2_rev), + (self._site2_fwd, self._site1_fwd, off2, off1), + (self._site2_rev, self._site1_rev, off2_rev, off1_rev), ] def overlap( diff --git a/tests/test_module_recombinase.py b/tests/test_module_recombinase.py index b0aaf236..88891a37 100644 --- a/tests/test_module_recombinase.py +++ b/tests/test_module_recombinase.py @@ -2,6 +2,7 @@ import pytest from Bio.SeqFeature import SimpleLocation from pydna.dseqrecord import Dseqrecord +from pydna.dseq import Dseq from pydna.assembly2 import ( Assembly, recombinase_integration, @@ -240,30 +241,31 @@ def test_recombinase_excision(): site2 = "AAaaTTTTTTTCCCT" rec = Recombinase(site1, site2) - genome = Dseqrecord( - f"cccccc{site1.upper()}tttt{site2.upper()}aaaaa", - circular=True, - ) + genome = Dseqrecord(f"cccccc{site1.upper()}tttt{site2.upper()}aaaaa") products = recombinase_excision(genome, rec) assert len(products) == 2 - assert str(products[0].seq) == "ccccccATGCCCTAAaaTTTTTTTCCCTtttt" - # assert str(products[1].seq) == "ccccccATGCCCTAAAATTTTTTTCCCTbbbbbCAAACTaaaaa" + assert products[0].seq.upper() == Dseq("AACTttttAA".upper()) + assert ( + products[1].seq.upper() + == Dseq("ccccccATGCCCTAAAATTTTTTTCCCTaaaaa", circular=True).upper() + ) def test_recombinase_integration_excision_reversibility(): """With same site on both sides (Cre-Lox style), integrate then excise returns originals.""" - site = "ATGCCCTAAaaTT" + site = "ATGaaGTA" genome = Dseqrecord(f"cccccc{site.upper()}aaaaa") insert = Dseqrecord(f"{site.upper()}bbbbb", circular=True) + rec = Recombinase(site, site) - products = recombinase_integration(genome, [insert], site, site) + products = recombinase_integration(genome, [insert], rec) assert len(products) == 1 integrated = products[0] - excised = recombinase_excision(integrated, site, site) + excised = recombinase_excision(integrated, rec) assert len(excised) == 2 - seqs = [str(p.seq).upper() for p in excised] - assert any(site.upper() in s for s in seqs) + assert excised[1].seq.seguid() == genome.seq.seguid() + assert excised[0].seq.seguid() == insert.seq.seguid() From 73474961bfd107818f522577be91650bf4677008 Mon Sep 17 00:00:00 2001 From: Manuel Lera-Ramirez Date: Wed, 25 Feb 2026 12:26:56 +0000 Subject: [PATCH 04/10] allow reversing recombinase --- src/pydna/recombinase.py | 39 ++++++++++++++++++++++++++++++++ tests/test_module_recombinase.py | 11 +++++++++ 2 files changed, 50 insertions(+) diff --git a/src/pydna/recombinase.py b/src/pydna/recombinase.py index 87e65171..f41dedd8 100644 --- a/src/pydna/recombinase.py +++ b/src/pydna/recombinase.py @@ -133,6 +133,45 @@ def __init__( (self._site2_rev, self._site1_rev, off2_rev, off1_rev), ] + def __repr__(self) -> str: + return f"Recombinase(site1={self.site1}, site2={self.site2}, site1_name={self.site1_name}, site2_name={self.site2_name})" + + def get_reverse_recombinase( + self, site12_name: str = "site12", site21_name: str = "site21" + ) -> "Recombinase": + """Get a recombinase that does the reverse reaction. + + Parameters + ---------- + site12_name : str, optional + Label for site12 in find/annotate output. Default is "site12". + site21_name : str, optional + Label for site21 in find/annotate output. Default is "site21". + + Returns + ------- + Recombinase + A recombinase that does the reverse reaction. + + Examples + -------- + >>> from pydna.dseqrecord import Dseqrecord + >>> from pydna.recombinase import Recombinase + >>> rec = Recombinase("ATGCCCTAAaaTT", "AAaaTTTTTTTCCCT") + >>> rec.get_reverse_recombinase() + Recombinase(site1=ATGCCCTAAaaTTTTTTTCCCT, site2=AAaaTT, site1_name=site12, site2_name=site21) + """ + _, _, off1, off2 = self._configs[0] + site12 = ( + self.site1[: off1 + self._homology_len] + + self.site2[off2 + self._homology_len :] + ) + site21 = ( + self.site2[: off2 + self._homology_len] + + self.site1[off1 + self._homology_len :] + ) + return Recombinase(site12, site21, site12_name, site21_name) + def overlap( self, seqx: Dseqrecord, diff --git a/tests/test_module_recombinase.py b/tests/test_module_recombinase.py index 88891a37..6ed3ece1 100644 --- a/tests/test_module_recombinase.py +++ b/tests/test_module_recombinase.py @@ -269,3 +269,14 @@ def test_recombinase_integration_excision_reversibility(): assert excised[1].seq.seguid() == genome.seq.seguid() assert excised[0].seq.seguid() == insert.seq.seguid() + + +def test_recombinase_reverse(): + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + rec = Recombinase(site1, site2) + rev_rec = rec.get_reverse_recombinase() + assert rev_rec.site1 == "ATGCCCTAAaaTTTTTTTCCCT" + assert rev_rec.site2 == "AAaaTT" + assert rev_rec.site1_name == "site12" + assert rev_rec.site2_name == "site21" From 450b340f145f8c904c655db9d6848593d7c8a145 Mon Sep 17 00:00:00 2001 From: Manuel Lera-Ramirez Date: Wed, 25 Feb 2026 16:17:49 +0000 Subject: [PATCH 05/10] used recombinase class in gateway --- src/pydna/gateway.py | 241 ++++++++++++++------------------- tests/test_module_assembly2.py | 2 +- 2 files changed, 102 insertions(+), 141 deletions(-) diff --git a/src/pydna/gateway.py b/src/pydna/gateway.py index e9a00e2e..0076f71e 100644 --- a/src/pydna/gateway.py +++ b/src/pydna/gateway.py @@ -1,164 +1,125 @@ # -*- coding: utf-8 -*- -from Bio.Seq import reverse_complement from pydna.dseqrecord import Dseqrecord -import re -import itertools -from Bio.SeqFeature import SimpleLocation, SeqFeature -from pydna.utils import shift_location -from pydna.sequence_regex import compute_regex_site, dseqrecord_finditer - - -raw_gateway_common = { - "attB1": "CHWVTWTGTACAAAAAANNNG", - "attB2": "CHWVTWTGTACAAGAAANNNG", - "attB3": "CHWVTWTGTATAATAAANNNG", - "attB4": "CHWVTWTGTATAGAAAANNNG", - "attB5": "CHWVTWTGTATACAAAANNNG", - "attL1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAAAAANNNG", - "attL2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAGAAANNNG", - "attL3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAATAAANNNG", - "attL4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAGAAAANNNG", - "attL5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATACAAAANNNG", - "attR1": "CHWVTWTGTACAAAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attR2": "CHWVTWTGTACAAGAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attR3": "CHWVTWTGTATAATAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attR4": "CHWVTWTGTATAGAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attR5": "CHWVTWTGTATACAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "overlap_1": "twtGTACAAAaaa", - "overlap_2": "twtGTACAAGaaa", - "overlap_3": "twtGTATAATaaa", - "overlap_4": "twtGTATAGAaaa", - "overlap_5": "twtGTATACAaaa", -} - - -raw_gateway_sites_greedy = { - **raw_gateway_common, - "attP1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attP2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAGAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attP3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAATAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attP4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAGAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attP5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATACAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", -} - -raw_gateway_sites_conservative = { - **raw_gateway_common, - "attP1": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTACAAAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", - "attP2": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTACAAGAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", - "attP3": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATAATAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", - "attP4": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATAGAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", - "attP5": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATACAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", -} - -gateway_sites_greedy = { - k: { - "forward_regex": compute_regex_site(v), - "reverse_regex": compute_regex_site(reverse_complement(v)), - "consensus_sequence": v, +from Bio.SeqFeature import SimpleLocation +from pydna.recombinase import Recombinase + + +def create_recombinase_dict() -> dict[str, dict[str, list[Recombinase]]]: + """Create a dictionary of recombinases for the Gateway reaction.""" + raw_gateway_common = { + "attB1": "CHWVTWTgtacaaaAAANNNG", + "attB2": "CHWVTWTgtacaagAAANNNG", + "attB3": "CHWVTWTgtataatAAANNNG", + "attB4": "CHWVTWTgtatagaAAANNNG", + "attB5": "CHWVTWTgtatacaAAANNNG", + "attL1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtacaaaAAANNNG", + "attL2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtacaagAAANNNG", + "attL3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtataatAAANNNG", + "attL4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtatagaAAANNNG", + "attL5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtatacaAAANNNG", + "attR1": "CHWVTWTgtacaaaAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attR2": "CHWVTWTgtacaagAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attR3": "CHWVTWTgtataatAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attR4": "CHWVTWTgtatagaAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attR5": "CHWVTWTgtatacaAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "overlap_1": "TWTgtacaaaAAA", + "overlap_2": "TWTgtacaagAAA", + "overlap_3": "TWTgtataatAAA", + "overlap_4": "TWTgtatagaAAA", + "overlap_5": "TWTgtatacaAAA", } - for k, v in raw_gateway_sites_greedy.items() -} - -gateway_sites_conservative = { - k: { - "forward_regex": compute_regex_site(v), - "reverse_regex": compute_regex_site(reverse_complement(v)), - "consensus_sequence": v, + + raw_gateway_sites_greedy = { + **raw_gateway_common, + "attP1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtacaaaAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attP2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtacaagAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attP3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtataatAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attP4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtatagaAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attP5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtatacaAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + } + + raw_gateway_sites_conservative = { + **raw_gateway_common, + "attP1": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTgtacaaaAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", + "attP2": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTgtacaagAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", + "attP3": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTgtataatAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", + "attP4": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTgtatagaAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", + "attP5": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTgtatacaAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", } - for k, v in raw_gateway_sites_conservative.items() -} -# From snapgene - ask Valerie -primer_design_attB = { - "attB1": "ACAAGTTTGTACAAAAAAGCAGGCT", - "attB2": "ACCACTTTGTACAAGAAAGCTGGGT", - "attB3": "ACAACTTTGTATAATAAAGTTGTA", - "attB4": "ACAACTTTGTATAGAAAAGTTGTA", - "attB5": "ACAACTTTGTATACAAAAGTTGTA", -} + out_dict = {} + + for reaction in ["BP", "LR"]: + left, right = reaction + conservative: list[Recombinase] = [] + greedy: list[Recombinase] = [] + for i in range(1, 6): + site1 = f"att{left}{i}" + site2 = f"att{right}{i}" + seq1_conservative = raw_gateway_sites_conservative[site1] + seq2_conservative = raw_gateway_sites_conservative[site2] + seq1_greedy = raw_gateway_sites_greedy[site1] + seq2_greedy = raw_gateway_sites_greedy[site2] + conservative.append( + Recombinase(seq1_conservative, seq2_conservative, site1, site2) + ) + greedy.append(Recombinase(seq1_greedy, seq2_greedy, site1, site2)) + + out_dict[reaction] = { + "conservative": conservative, + "greedy": greedy, + } + return out_dict + + +recombinase_dict = create_recombinase_dict() def gateway_overlap( seqx: Dseqrecord, seqy: Dseqrecord, reaction: str, greedy: bool ) -> list[tuple[int, int, int]]: """ - Find gateway overlaps. If greedy is True, it uses a more greedy consensus site to find attP sites, - which might give false positives + Assembly Algorithm: Find gateway overlaps. If greedy is True, it uses a more greedy consensus site to find attP sites, + which might give false positives. + + Parameters + ---------- + seqx : Dseqrecord + First sequence to find overlaps. + seqy : Dseqrecord + Second sequence to find overlaps. + reaction : str + Type of Gateway reaction (BP or LR). + greedy : bool + If True, use greedy gateway consensus sites. + + Returns + ------- + list[tuple[int, int, int]] A list of overlaps between the two sequences. """ - if reaction not in ["BP", "LR"]: - raise ValueError(f"Invalid overlap type: {reaction}") - - gateway_sites = gateway_sites_greedy if greedy else gateway_sites_conservative - out = list() - # Iterate over the four possible att sites - for num in range(1, 5): - # Iterate over the two possible orientations - # The sites have to be in the same orientation (fwd + fwd or rev + rev) - for pattern in ["forward_regex", "reverse_regex"]: - # The overlap regex is the same for all types - overlap_regex = gateway_sites[f"overlap_{num}"][pattern] - - # Iterate over pairs B, P and P, B for BP and L, R and R, L for LR - for site_x, site_y in zip(reaction, reaction[::-1]): - site_x_regex = gateway_sites[f"att{site_x}{num}"][pattern] - matches_x = list(dseqrecord_finditer(site_x_regex, seqx)) - if len(matches_x) == 0: - continue - - site_y_regex = gateway_sites[f"att{site_y}{num}"][pattern] - matches_y = list(dseqrecord_finditer(site_y_regex, seqy)) - if len(matches_y) == 0: - continue - - for match_x, match_y in itertools.product(matches_x, matches_y): - # Find the overlap sequence within each match, and use the - # core 7 pbs that are constant - overlap_x = re.search(overlap_regex, match_x.group()) - overlap_y = re.search(overlap_regex, match_y.group()) - - # Sanity check - assert ( - overlap_x is not None and overlap_y is not None - ), "Something went wrong, no overlap found within the matches" - - out.append( - ( - match_x.start() + overlap_x.start() + 3, - match_y.start() + overlap_y.start() + 3, - 7, - ) - ) - - return out + type = "greedy" if greedy else "conservative" + recombinases = recombinase_dict[reaction][type] + return sum((r.overlap(seqx, seqy) for r in recombinases), []) def find_gateway_sites( seq: Dseqrecord, greedy: bool ) -> dict[str, list[SimpleLocation]]: """Find all gateway sites in a sequence and return a dictionary with the name and positions of the sites.""" - gateway_sites = gateway_sites_greedy if greedy else gateway_sites_conservative + + type = "greedy" if greedy else "conservative" out = dict() - for site in gateway_sites: - if not site.startswith("att"): - continue - - for pattern in ["forward_regex", "reverse_regex"]: - matches = list(dseqrecord_finditer(gateway_sites[site][pattern], seq)) - for match in matches: - if site not in out: - out[site] = [] - strand = 1 if pattern == "forward_regex" else -1 - loc = SimpleLocation(match.start(), match.end(), strand) - loc = shift_location(loc, 0, len(seq)) - out[site].append(loc) + for reaction in ["BP", "LR"]: + for rec in recombinase_dict[reaction][type]: + out.update(rec.find(seq)) return out def annotate_gateway_sites(seq: Dseqrecord, greedy: bool) -> Dseqrecord: - sites = find_gateway_sites(seq, greedy) - for site in sites: - for loc in sites[site]: - seq.features.append( - SeqFeature(loc, type="protein_bind", qualifiers={"label": [site]}) - ) + """Annotate gateway sites in a sequence.""" + type = "greedy" if greedy else "conservative" + out = seq + for reaction in ["BP", "LR"]: + for rec in recombinase_dict[reaction][type]: + out = rec.annotate(out) return seq diff --git a/tests/test_module_assembly2.py b/tests/test_module_assembly2.py index 19ee9084..8f1ca2d7 100644 --- a/tests/test_module_assembly2.py +++ b/tests/test_module_assembly2.py @@ -2638,7 +2638,7 @@ def test_gateway_assembly(): # Test that greedy is being used (finds) with pytest.raises(ValueError) as e: assembly.gateway_assembly(products_LR, "LR", greedy=True) - assert "fragment 2: attB1, attL1, attR1, attP1" in str(e.value) + assert "fragment 2: attB1, attP1, attL1, attR1" in str(e.value) # Test multi site only seq1 = Dseqrecord("aaa" + attB1 + "ggg" + attB2 + "ccc", circular=True) From e526a51dd1a8775442453f7f5c7cf84899dc7229 Mon Sep 17 00:00:00 2001 From: Manuel Lera-Ramirez Date: Wed, 25 Feb 2026 17:22:43 +0000 Subject: [PATCH 06/10] add collection of recombinases --- src/pydna/assembly2.py | 4 +- src/pydna/gateway.py | 31 +++++---- src/pydna/recombinase.py | 108 +++++++++++++++++++++++++++++-- tests/test_module_recombinase.py | 61 +++++++++++++++++ 4 files changed, 182 insertions(+), 22 deletions(-) diff --git a/src/pydna/assembly2.py b/src/pydna/assembly2.py index 2c874dc3..ac86722e 100644 --- a/src/pydna/assembly2.py +++ b/src/pydna/assembly2.py @@ -2855,11 +2855,13 @@ def recombinase_integration( -------- >>> from pydna.dseqrecord import Dseqrecord >>> from pydna.assembly2 import recombinase_integration, recombinase_excision + >>> from pydna.recombinase import Recombinase >>> site1 = "ATGCCCTAAaaTT" >>> site2 = "AAaaTTTTTTTCCCT" >>> genome = Dseqrecord(f"cccccc{site1.upper()}aaaaa") >>> insert = Dseqrecord(f"{site2.upper()}bbbbb", circular=True) - >>> products = recombinase_integration(genome, [insert], site1, site2) + >>> recombinase = Recombinase(site1, site2) + >>> products = recombinase_integration(genome, [insert], recombinase) >>> len(products) >= 1 True """ diff --git a/src/pydna/gateway.py b/src/pydna/gateway.py index 0076f71e..212eaee3 100644 --- a/src/pydna/gateway.py +++ b/src/pydna/gateway.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- from pydna.dseqrecord import Dseqrecord from Bio.SeqFeature import SimpleLocation -from pydna.recombinase import Recombinase +from pydna.recombinase import Recombinase, RecombinaseCollection -def create_recombinase_dict() -> dict[str, dict[str, list[Recombinase]]]: +def create_recombinase_dict() -> dict[str, dict[str, RecombinaseCollection]]: """Create a dictionary of recombinases for the Gateway reaction.""" raw_gateway_common = { "attB1": "CHWVTWTgtacaaaAAANNNG", @@ -66,8 +66,8 @@ def create_recombinase_dict() -> dict[str, dict[str, list[Recombinase]]]: greedy.append(Recombinase(seq1_greedy, seq2_greedy, site1, site2)) out_dict[reaction] = { - "conservative": conservative, - "greedy": greedy, + "conservative": RecombinaseCollection(conservative), + "greedy": RecombinaseCollection(greedy), } return out_dict @@ -98,8 +98,7 @@ def gateway_overlap( list[tuple[int, int, int]] A list of overlaps between the two sequences. """ type = "greedy" if greedy else "conservative" - recombinases = recombinase_dict[reaction][type] - return sum((r.overlap(seqx, seqy) for r in recombinases), []) + return recombinase_dict[reaction][type].overlap(seqx, seqy) def find_gateway_sites( @@ -108,18 +107,18 @@ def find_gateway_sites( """Find all gateway sites in a sequence and return a dictionary with the name and positions of the sites.""" type = "greedy" if greedy else "conservative" - out = dict() - for reaction in ["BP", "LR"]: - for rec in recombinase_dict[reaction][type]: - out.update(rec.find(seq)) - return out + collection = RecombinaseCollection( + recombinase_dict["BP"][type].recombinases + + recombinase_dict["LR"][type].recombinases + ) + return collection.find(seq) def annotate_gateway_sites(seq: Dseqrecord, greedy: bool) -> Dseqrecord: """Annotate gateway sites in a sequence.""" type = "greedy" if greedy else "conservative" - out = seq - for reaction in ["BP", "LR"]: - for rec in recombinase_dict[reaction][type]: - out = rec.annotate(out) - return seq + collection = RecombinaseCollection( + recombinase_dict["BP"][type].recombinases + + recombinase_dict["LR"][type].recombinases + ) + return collection.annotate(seq) diff --git a/src/pydna/recombinase.py b/src/pydna/recombinase.py index f41dedd8..98754287 100644 --- a/src/pydna/recombinase.py +++ b/src/pydna/recombinase.py @@ -11,13 +11,79 @@ For example:: - site1 = "ATGCCCTAAaaTT" - site2 = "AAaaTTTTTTTCCCT" + >>> site1 = "ATGCCCTAAaaTT" + >>> site2 = "AAaaTTTTTTTCCCT" The lowercase ``aa`` is the overlap that will appear in the assembled product. Sites may contain IUPAC degenerate bases (N, W, S, etc.) and the search handles both linear and circular sequences. + +The easiest way to use it is with the recombinase_integration and +recombinase_excision functions from the assembly2 module. + +Integration and excision with a single recombinase:: + + >>> from pydna.dseqrecord import Dseqrecord + >>> from pydna.recombinase import Recombinase + >>> from pydna.assembly2 import recombinase_integration, recombinase_excision + >>> site1 = "ATGCCCTAAaaCT" + >>> site2 = "CAaaTTTTTTTCCCT" + >>> genome = Dseqrecord("ccccccATGCCCTAAAACTaaaaa") + >>> insert = Dseqrecord("CAAATTTTTTTCCCTbbbbb", circular=True) + >>> rec = Recombinase(site1, site2) + >>> products = recombinase_integration(genome, [insert], rec) + >>> + >>> # Cre-Lox style (same site on both sides): integrate then excise returns originals + >>> site = "ATGaaGTA" + >>> genome = Dseqrecord("ccccccATGAAGTAAAAA") + >>> insert = Dseqrecord("ATGAAGTABbbbb", circular=True) + >>> rec = Recombinase(site, site) + >>> integrated = recombinase_integration(genome, [insert], rec)[0] + >>> excised = recombinase_excision(integrated, rec) + +Find and annotate sites in a sequence:: + + >>> from pydna.dseqrecord import Dseqrecord + >>> from pydna.recombinase import Recombinase + >>> site1, site2 = "ATGCCCTAAaaTT", "AAaaTTTTTTTCCCT" + >>> rec = Recombinase(site1, site2, site1_name="mysite1", site2_name="mysite2") + >>> seq = Dseqrecord("gggATGCCCTAAaaTTttt") + >>> sites = rec.find(seq) + >>> annotated = rec.annotate(seq) + +When several sites are possible, or when using multiple recombinases, you can +create a RecombinaseCollection. It has the methods overlap, find, and annotate, +so you can use it as a single Recombinase. For an example, check the gateway module:: + + >>> from pydna.dseqrecord import Dseqrecord + >>> from pydna.recombinase import Recombinase, RecombinaseCollection + >>> rec1 = Recombinase("AAaaTTC", "CCaaTTC", site1_name="s1", site2_name="s2") + >>> rec2 = Recombinase("GAccACC", "TCccAAC", site1_name="s3", site2_name="s4") + >>> collection = RecombinaseCollection([rec1, rec2]) + >>> seq = Dseqrecord("gggAAAATTCTtttGACCACCTttt") + >>> sites = collection.find(seq) + >>> annotated = collection.annotate(seq) + +IUPAC degenerate bases (e.g. N matches any base):: + + >>> site1 = "ATGNNNaaTT" + >>> site2 = "CCNNaaTTGG" + >>> rec = Recombinase(site1, site2) + +Using a Recombinase as Assembly algorithm:: + + >>> from pydna.dseqrecord import Dseqrecord + >>> from pydna.recombinase import Recombinase + >>> from pydna.assembly2 import Assembly + >>> site1 = "ATGCCCTAAaaTT" + >>> site2 = "AAaaTTTTTTTCCCT" + >>> seqA = Dseqrecord("aaaATGCCCTAAaaTTtt") + >>> seqB = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") + >>> rec = Recombinase(site1, site2) + >>> asm = Assembly([seqA, seqB], algorithm=rec.overlap, use_fragment_order=False) + >>> products = asm.assemble_linear() + """ import re @@ -89,10 +155,9 @@ class Recombinase: >>> rec = Recombinase("ATGCCCTAAaaTT", "AAaaTTTTTTTCCCT") >>> seqA = Dseqrecord("aaaATGCCCTAAaaTTtt") >>> seqB = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") - >>> rec.overlap(seqA, seqB) - [(12, 6, 2)] + >>> _ = rec.overlap(seqA, seqB) >>> sites = rec.find(seqA) - >>> rec.annotate(seqA) + >>> _ = rec.annotate(seqA) """ def __init__( @@ -281,3 +346,36 @@ def annotate(self, seq: Dseqrecord) -> Dseqrecord: ) ) return out_seq + + +class RecombinaseCollection: + """A collection of recombinases.""" + + def __init__(self, recombinases: list[Recombinase]): + if not isinstance(recombinases, list): + raise ValueError("recombinases must be a list of Recombinase objects") + if not all(isinstance(r, Recombinase) for r in recombinases): + raise ValueError("recombinases must be a list of Recombinase objects") + if len(recombinases) == 0: + raise ValueError("recombinases must be a non-empty list") + self.recombinases = recombinases + + def overlap(self, seqx: Dseqrecord, seqy: Dseqrecord) -> list[SequenceOverlap]: + """Find overlaps between *seqx* and *seqy* mediated by the recombinases.""" + return sum((r.overlap(seqx, seqy) for r in self.recombinases), []) + + def find(self, seq: Dseqrecord) -> dict[str, list[SimpleLocation]]: + """Find all occurrences of the recombinase sites in *seq*.""" + out = dict() + for rec in self.recombinases: + rec_sites = rec.find(seq) + for k, v in rec_sites.items(): + out.setdefault(k, []).extend(v) + return out + + def annotate(self, seq: Dseqrecord) -> Dseqrecord: + """Annotate *seq* with features for all occurrences of the recombinase sites.""" + out = seq + for rec in self.recombinases: + out = rec.annotate(out) + return out diff --git a/tests/test_module_recombinase.py b/tests/test_module_recombinase.py index 6ed3ece1..0f0ce93c 100644 --- a/tests/test_module_recombinase.py +++ b/tests/test_module_recombinase.py @@ -11,6 +11,7 @@ from pydna.recombinase import ( _recombinase_homology_offset_and_length, Recombinase, + RecombinaseCollection, ) @@ -280,3 +281,63 @@ def test_recombinase_reverse(): assert rev_rec.site2 == "AAaaTT" assert rev_rec.site1_name == "site12" assert rev_rec.site2_name == "site21" + + +# --------------------------------------------------------------------------- +# RecombinaseCollection +# --------------------------------------------------------------------------- + + +def test_recombinase_collection(): + site1 = "AAaaTTC" + site2 = "CCaaTTC" + site3 = "GAccACC" + site4 = "TCccAAC" + rec1 = Recombinase(site1, site2, site1_name="s1", site2_name="s2") + rec2 = Recombinase(site3, site4, site1_name="s3", site2_name="s4") + collection = RecombinaseCollection([rec1, rec2]) + seq1 = Dseqrecord(f"ggg{site1.upper()}ttt{site3.upper()}ttt") + seq2 = Dseqrecord(f"gggc{site2.upper()}ttt{site4.upper()}ttt") + print(collection.overlap(seq1, seq2)) + assert collection.overlap(seq1, seq2) == [(5, 6, 2), (15, 16, 2)] + + +def test_recombinase_collection_find(): + # Here the important thing to test is that if + # two recombinases have the same site name, the find method should return + # both locations. + site1 = "AAaaTTC" + site2 = "CCaaTTC" + site3 = "GAccACC" + site4 = "TCccAAC" + rec1 = Recombinase(site1, site2, site1_name="s1", site2_name="s2") + rec2 = Recombinase(site3, site4, site1_name="s1", site2_name="s2") + collection = RecombinaseCollection([rec1, rec2]) + seq = Dseqrecord(f"ggg{site1.upper()}ttt{site3.upper()}ttt") + assert len(collection.find(seq)["s1"]) == 2 + + +def test_recombinase_collection_annotate(): + # Same as above, checking that two sites with the same name are annotated. + site1 = "AAaaTTC" + site2 = "CCaaTTC" + site3 = "GAccACC" + site4 = "TCccAAC" + rec1 = Recombinase(site1, site2, site1_name="s1", site2_name="s2") + rec2 = Recombinase(site3, site4, site1_name="s1", site2_name="s2") + collection = RecombinaseCollection([rec1, rec2]) + seq = Dseqrecord(f"ggg{site1.upper()}ttt{site3.upper()}ttt") + annotated_seq = collection.annotate(seq) + assert len(annotated_seq.features) == 2 + assert annotated_seq.features[0].qualifiers.get("label", []) == ["s1"] + assert annotated_seq.features[1].qualifiers.get("label", []) == ["s1"] + + +def test_recombinase_collection_init_errors(): + # Check that the init method raises errors for invalid inputs. + with pytest.raises(ValueError): + RecombinaseCollection(None) + with pytest.raises(ValueError): + RecombinaseCollection([]) + with pytest.raises(ValueError): + RecombinaseCollection([Recombinase("AAaaTTC", "CCaaTTC"), "blah"]) From 49f10d07d3de50c6a876808c96a048cec67a5fc4 Mon Sep 17 00:00:00 2001 From: Manuel Lera-Ramirez Date: Wed, 25 Feb 2026 17:28:48 +0000 Subject: [PATCH 07/10] extra docs --- src/pydna/recombinase.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/pydna/recombinase.py b/src/pydna/recombinase.py index 98754287..38a32d35 100644 --- a/src/pydna/recombinase.py +++ b/src/pydna/recombinase.py @@ -65,12 +65,20 @@ >>> sites = collection.find(seq) >>> annotated = collection.annotate(seq) -IUPAC degenerate bases (e.g. N matches any base):: +IUPAC degenerate bases work (e.g. N matches any base):: >>> site1 = "ATGNNNaaTT" >>> site2 = "CCNNaaTTGG" >>> rec = Recombinase(site1, site2) +If the recombinase reaction is reversible, you can get the reverse recombinase with the get_reverse_recombinase method. + + >>> rec = Recombinase('AAAcgCG', 'TTAcgATGA', site1_name="s1", site2_name="s2") + >>> rec + Recombinase(site1=AAAcgCG, site2=TTAcgATGA, site1_name=s1, site2_name=s2) + >>> rec.get_reverse_recombinase(site12_name="s1+s2", site21_name="s2+s1") + Recombinase(site1=AAAcgATGA, site2=TTAcgCG, site1_name=s1+s2, site2_name=s2+s1) + Using a Recombinase as Assembly algorithm:: >>> from pydna.dseqrecord import Dseqrecord From a7e56176548acbaf73439009166bb182b00ad228 Mon Sep 17 00:00:00 2001 From: Manuel Lera-Ramirez Date: Wed, 25 Feb 2026 17:46:41 +0000 Subject: [PATCH 08/10] improve test coverage --- tests/test_module_gateway.py | 9 ++++++++- tests/test_module_recombinase.py | 7 +++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/test_module_gateway.py b/tests/test_module_gateway.py index 3d24a3b6..402a15d7 100644 --- a/tests/test_module_gateway.py +++ b/tests/test_module_gateway.py @@ -1,4 +1,5 @@ -from pydna.gateway import gateway_overlap, find_gateway_sites +# -*- coding: utf-8 -*- +from pydna.gateway import annotate_gateway_sites, gateway_overlap, find_gateway_sites import pydna.assembly2 as assembly import glob from pydna.parsers import parse_snapgene @@ -119,3 +120,9 @@ def test_find_gateway_sites(): assert find_gateway_sites(seq, greedy=False) == { "attB3": [SimpleLocation(0, 21, 1), SimpleLocation(24, 45, 1)] } + + +def test_annotate_gateway_sites(): + seq = Dseqrecord("CAACTTTGTATACAAAAGTTGaaaCAACTTTGTATAATAAAGTTG") + annotated_seq = annotate_gateway_sites(seq, greedy=False) + assert len(annotated_seq.features) == 2 diff --git a/tests/test_module_recombinase.py b/tests/test_module_recombinase.py index 0f0ce93c..a5b83304 100644 --- a/tests/test_module_recombinase.py +++ b/tests/test_module_recombinase.py @@ -49,6 +49,13 @@ def test_recombinase_homology_offset_and_length_invalid_chars(): # --------------------------------------------------------------------------- +def test_recombinase_init_errors(): + # Different cores + with pytest.raises(ValueError) as e: + Recombinase("ATGacCTAAATT", "AAaaTTTTTTTCCCT") + assert "Recombinase recognition sites do not have matching" in str(e.value) + + def test_recombinase_overlap_single_match(): site1 = "ATGCCCTAAaaTT" site2 = "AAaaTTTTTTTCCCT" From 424ac4200f988b0fa9d349ec43535cb5bab14610 Mon Sep 17 00:00:00 2001 From: Manuel Lera-Ramirez Date: Wed, 25 Feb 2026 17:50:12 +0000 Subject: [PATCH 09/10] copilot feedback --- src/pydna/gateway.py | 16 ++++++++-------- src/pydna/recombinase.py | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/pydna/gateway.py b/src/pydna/gateway.py index 212eaee3..471beb85 100644 --- a/src/pydna/gateway.py +++ b/src/pydna/gateway.py @@ -97,8 +97,8 @@ def gateway_overlap( ------- list[tuple[int, int, int]] A list of overlaps between the two sequences. """ - type = "greedy" if greedy else "conservative" - return recombinase_dict[reaction][type].overlap(seqx, seqy) + mode = "greedy" if greedy else "conservative" + return recombinase_dict[reaction][mode].overlap(seqx, seqy) def find_gateway_sites( @@ -106,19 +106,19 @@ def find_gateway_sites( ) -> dict[str, list[SimpleLocation]]: """Find all gateway sites in a sequence and return a dictionary with the name and positions of the sites.""" - type = "greedy" if greedy else "conservative" + mode = "greedy" if greedy else "conservative" collection = RecombinaseCollection( - recombinase_dict["BP"][type].recombinases - + recombinase_dict["LR"][type].recombinases + recombinase_dict["BP"][mode].recombinases + + recombinase_dict["LR"][mode].recombinases ) return collection.find(seq) def annotate_gateway_sites(seq: Dseqrecord, greedy: bool) -> Dseqrecord: """Annotate gateway sites in a sequence.""" - type = "greedy" if greedy else "conservative" + mode = "greedy" if greedy else "conservative" collection = RecombinaseCollection( - recombinase_dict["BP"][type].recombinases - + recombinase_dict["LR"][type].recombinases + recombinase_dict["BP"][mode].recombinases + + recombinase_dict["LR"][mode].recombinases ) return collection.annotate(seq) diff --git a/src/pydna/recombinase.py b/src/pydna/recombinase.py index 38a32d35..7759da94 100644 --- a/src/pydna/recombinase.py +++ b/src/pydna/recombinase.py @@ -331,12 +331,12 @@ def annotate(self, seq: Dseqrecord) -> Dseqrecord: Parameters ---------- seq : Dseqrecord - Sequence to annotate (modified in place and returned). + Sequence to annotate. Returns ------- Dseqrecord - The same sequence with added features. + A copy of the sequence with added features. """ out_seq = copy.deepcopy(seq) sites = self.find(out_seq) From 56ad9b2dd2a83a80b003af6f36f9ec0fa95f84fc Mon Sep 17 00:00:00 2001 From: Manuel Lera-Ramirez Date: Wed, 25 Feb 2026 17:57:03 +0000 Subject: [PATCH 10/10] improve test coverage --- tests/test_module_recombinase.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_module_recombinase.py b/tests/test_module_recombinase.py index a5b83304..b2984bb5 100644 --- a/tests/test_module_recombinase.py +++ b/tests/test_module_recombinase.py @@ -339,6 +339,10 @@ def test_recombinase_collection_annotate(): assert annotated_seq.features[0].qualifiers.get("label", []) == ["s1"] assert annotated_seq.features[1].qualifiers.get("label", []) == ["s1"] + # It does not re-annotate if the sequence is already annotated. + annotated_seq = collection.annotate(annotated_seq) + assert len(annotated_seq.features) == 2 + def test_recombinase_collection_init_errors(): # Check that the init method raises errors for invalid inputs.