diff --git a/src/pydna/assembly2.py b/src/pydna/assembly2.py index d62f8fb7..ac86722e 100644 --- a/src/pydna/assembly2.py +++ b/src/pydna/assembly2.py @@ -39,6 +39,7 @@ from pydna.gateway import gateway_overlap, find_gateway_sites from pydna.cre_lox import cre_loxP_overlap from pydna.alphabet import anneal_strands +from pydna.recombinase import Recombinase from typing import TYPE_CHECKING, Callable, Literal from pydna.opencloning_models import ( @@ -52,6 +53,7 @@ GatewaySource, HomologousRecombinationSource, CreLoxRecombinationSource, + RecombinaseSource, PCRSource, SourceInput, CRISPRSource, @@ -1968,11 +1970,15 @@ def __init__(self, frags: [Dseqrecord], limit=25, algorithm=common_sub_strings): self.G.add_node(1, seq=frag) matches = algorithm(frag, frag, limit) + # Remove matches where the whole sequence matches + matches = [match for match in matches if match[2] != len(frag)] + for match in matches: self.add_edges_from_match(match, 1, 1, frag, frag) # To avoid duplicated outputs - self.G.remove_edges_from([(-1, -1)]) + while (-1, -1) in self.G.edges(): + self.G.remove_edges_from([(-1, -1)]) # These two are constrained self.use_fragment_order = True @@ -2801,6 +2807,72 @@ def cre_lox_excision(genome: Dseqrecord) -> list[Dseqrecord]: return _recast_sources(products, CreLoxRecombinationSource) +def recombinase_excision( + genome: Dseqrecord, + recombinase: Recombinase, +) -> list[Dseqrecord]: + """Returns the products for recombinase-mediated excision. + + Parameters + ---------- + genome : Dseqrecord + Target genome sequence containing two recombinase sites. + recombinase : Recombinase + Recombinase object. + + Returns + ------- + list[Dseqrecord] + List containing excised plasmid and remaining genome sequence. + """ + products = common_function_excision_products(genome, None, recombinase.overlap) + products = [recombinase.annotate(p) for p in products] + return _recast_sources(products, RecombinaseSource) + + +def recombinase_integration( + genome: Dseqrecord, + inserts: list[Dseqrecord], + recombinase: Recombinase, +) -> list[Dseqrecord]: + """Returns the products resulting from recombinase-mediated integration. + + Parameters + ---------- + genome : Dseqrecord + Target genome sequence. + inserts : list[Dseqrecord] + DNA fragment(s) to insert. + recombinase : Recombinase + Recombinase object. + + Returns + ------- + list[Dseqrecord] + List of integrated DNA molecules. + + Examples + -------- + >>> from pydna.dseqrecord import Dseqrecord + >>> from pydna.assembly2 import recombinase_integration, recombinase_excision + >>> from pydna.recombinase import Recombinase + >>> site1 = "ATGCCCTAAaaTT" + >>> site2 = "AAaaTTTTTTTCCCT" + >>> genome = Dseqrecord(f"cccccc{site1.upper()}aaaaa") + >>> insert = Dseqrecord(f"{site2.upper()}bbbbb", circular=True) + >>> recombinase = Recombinase(site1, site2) + >>> products = recombinase_integration(genome, [insert], recombinase) + >>> len(products) >= 1 + True + """ + fragments = common_handle_insertion_fragments(genome, inserts) + products = common_function_integration_products( + fragments, None, recombinase.overlap + ) + products = [recombinase.annotate(p) for p in products] + return _recast_sources(products, RecombinaseSource) + + def crispr_integration( genome: Dseqrecord, inserts: list[Dseqrecord], diff --git a/src/pydna/gateway.py b/src/pydna/gateway.py index e9a00e2e..471beb85 100644 --- a/src/pydna/gateway.py +++ b/src/pydna/gateway.py @@ -1,164 +1,124 @@ # -*- coding: utf-8 -*- -from Bio.Seq import reverse_complement from pydna.dseqrecord import Dseqrecord -import re -import itertools -from Bio.SeqFeature import SimpleLocation, SeqFeature -from pydna.utils import shift_location -from pydna.sequence_regex import compute_regex_site, dseqrecord_finditer - - -raw_gateway_common = { - "attB1": "CHWVTWTGTACAAAAAANNNG", - "attB2": "CHWVTWTGTACAAGAAANNNG", - "attB3": "CHWVTWTGTATAATAAANNNG", - "attB4": "CHWVTWTGTATAGAAAANNNG", - "attB5": "CHWVTWTGTATACAAAANNNG", - "attL1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAAAAANNNG", - "attL2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAGAAANNNG", - "attL3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAATAAANNNG", - "attL4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAGAAAANNNG", - "attL5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATACAAAANNNG", - "attR1": "CHWVTWTGTACAAAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attR2": "CHWVTWTGTACAAGAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attR3": "CHWVTWTGTATAATAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attR4": "CHWVTWTGTATAGAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attR5": "CHWVTWTGTATACAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "overlap_1": "twtGTACAAAaaa", - "overlap_2": "twtGTACAAGaaa", - "overlap_3": "twtGTATAATaaa", - "overlap_4": "twtGTATAGAaaa", - "overlap_5": "twtGTATACAaaa", -} - - -raw_gateway_sites_greedy = { - **raw_gateway_common, - "attP1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attP2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAGAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attP3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAATAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attP4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAGAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", - "attP5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATACAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", -} - -raw_gateway_sites_conservative = { - **raw_gateway_common, - "attP1": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTACAAAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", - "attP2": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTACAAGAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", - "attP3": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATAATAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", - "attP4": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATAGAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", - "attP5": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATACAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", -} - -gateway_sites_greedy = { - k: { - "forward_regex": compute_regex_site(v), - "reverse_regex": compute_regex_site(reverse_complement(v)), - "consensus_sequence": v, +from Bio.SeqFeature import SimpleLocation +from pydna.recombinase import Recombinase, RecombinaseCollection + + +def create_recombinase_dict() -> dict[str, dict[str, RecombinaseCollection]]: + """Create a dictionary of recombinases for the Gateway reaction.""" + raw_gateway_common = { + "attB1": "CHWVTWTgtacaaaAAANNNG", + "attB2": "CHWVTWTgtacaagAAANNNG", + "attB3": "CHWVTWTgtataatAAANNNG", + "attB4": "CHWVTWTgtatagaAAANNNG", + "attB5": "CHWVTWTgtatacaAAANNNG", + "attL1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtacaaaAAANNNG", + "attL2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtacaagAAANNNG", + "attL3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtataatAAANNNG", + "attL4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtatagaAAANNNG", + "attL5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtatacaAAANNNG", + "attR1": "CHWVTWTgtacaaaAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attR2": "CHWVTWTgtacaagAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attR3": "CHWVTWTgtataatAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attR4": "CHWVTWTgtatagaAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attR5": "CHWVTWTgtatacaAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "overlap_1": "TWTgtacaaaAAA", + "overlap_2": "TWTgtacaagAAA", + "overlap_3": "TWTgtataatAAA", + "overlap_4": "TWTgtatagaAAA", + "overlap_5": "TWTgtatacaAAA", } - for k, v in raw_gateway_sites_greedy.items() -} - -gateway_sites_conservative = { - k: { - "forward_regex": compute_regex_site(v), - "reverse_regex": compute_regex_site(reverse_complement(v)), - "consensus_sequence": v, + + raw_gateway_sites_greedy = { + **raw_gateway_common, + "attP1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtacaaaAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attP2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtacaagAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attP3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtataatAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attP4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtatagaAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + "attP5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTgtatacaAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV", + } + + raw_gateway_sites_conservative = { + **raw_gateway_common, + "attP1": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTgtacaaaAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", + "attP2": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTgtacaagAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", + "attP3": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTgtataatAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", + "attP4": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTgtatagaAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", + "attP5": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTgtatacaAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA", } - for k, v in raw_gateway_sites_conservative.items() -} -# From snapgene - ask Valerie -primer_design_attB = { - "attB1": "ACAAGTTTGTACAAAAAAGCAGGCT", - "attB2": "ACCACTTTGTACAAGAAAGCTGGGT", - "attB3": "ACAACTTTGTATAATAAAGTTGTA", - "attB4": "ACAACTTTGTATAGAAAAGTTGTA", - "attB5": "ACAACTTTGTATACAAAAGTTGTA", -} + out_dict = {} + + for reaction in ["BP", "LR"]: + left, right = reaction + conservative: list[Recombinase] = [] + greedy: list[Recombinase] = [] + for i in range(1, 6): + site1 = f"att{left}{i}" + site2 = f"att{right}{i}" + seq1_conservative = raw_gateway_sites_conservative[site1] + seq2_conservative = raw_gateway_sites_conservative[site2] + seq1_greedy = raw_gateway_sites_greedy[site1] + seq2_greedy = raw_gateway_sites_greedy[site2] + conservative.append( + Recombinase(seq1_conservative, seq2_conservative, site1, site2) + ) + greedy.append(Recombinase(seq1_greedy, seq2_greedy, site1, site2)) + + out_dict[reaction] = { + "conservative": RecombinaseCollection(conservative), + "greedy": RecombinaseCollection(greedy), + } + return out_dict + + +recombinase_dict = create_recombinase_dict() def gateway_overlap( seqx: Dseqrecord, seqy: Dseqrecord, reaction: str, greedy: bool ) -> list[tuple[int, int, int]]: """ - Find gateway overlaps. If greedy is True, it uses a more greedy consensus site to find attP sites, - which might give false positives + Assembly Algorithm: Find gateway overlaps. If greedy is True, it uses a more greedy consensus site to find attP sites, + which might give false positives. + + Parameters + ---------- + seqx : Dseqrecord + First sequence to find overlaps. + seqy : Dseqrecord + Second sequence to find overlaps. + reaction : str + Type of Gateway reaction (BP or LR). + greedy : bool + If True, use greedy gateway consensus sites. + + Returns + ------- + list[tuple[int, int, int]] A list of overlaps between the two sequences. """ - if reaction not in ["BP", "LR"]: - raise ValueError(f"Invalid overlap type: {reaction}") - - gateway_sites = gateway_sites_greedy if greedy else gateway_sites_conservative - out = list() - # Iterate over the four possible att sites - for num in range(1, 5): - # Iterate over the two possible orientations - # The sites have to be in the same orientation (fwd + fwd or rev + rev) - for pattern in ["forward_regex", "reverse_regex"]: - # The overlap regex is the same for all types - overlap_regex = gateway_sites[f"overlap_{num}"][pattern] - - # Iterate over pairs B, P and P, B for BP and L, R and R, L for LR - for site_x, site_y in zip(reaction, reaction[::-1]): - site_x_regex = gateway_sites[f"att{site_x}{num}"][pattern] - matches_x = list(dseqrecord_finditer(site_x_regex, seqx)) - if len(matches_x) == 0: - continue - - site_y_regex = gateway_sites[f"att{site_y}{num}"][pattern] - matches_y = list(dseqrecord_finditer(site_y_regex, seqy)) - if len(matches_y) == 0: - continue - - for match_x, match_y in itertools.product(matches_x, matches_y): - # Find the overlap sequence within each match, and use the - # core 7 pbs that are constant - overlap_x = re.search(overlap_regex, match_x.group()) - overlap_y = re.search(overlap_regex, match_y.group()) - - # Sanity check - assert ( - overlap_x is not None and overlap_y is not None - ), "Something went wrong, no overlap found within the matches" - - out.append( - ( - match_x.start() + overlap_x.start() + 3, - match_y.start() + overlap_y.start() + 3, - 7, - ) - ) - - return out + mode = "greedy" if greedy else "conservative" + return recombinase_dict[reaction][mode].overlap(seqx, seqy) def find_gateway_sites( seq: Dseqrecord, greedy: bool ) -> dict[str, list[SimpleLocation]]: """Find all gateway sites in a sequence and return a dictionary with the name and positions of the sites.""" - gateway_sites = gateway_sites_greedy if greedy else gateway_sites_conservative - out = dict() - for site in gateway_sites: - if not site.startswith("att"): - continue - - for pattern in ["forward_regex", "reverse_regex"]: - matches = list(dseqrecord_finditer(gateway_sites[site][pattern], seq)) - for match in matches: - if site not in out: - out[site] = [] - strand = 1 if pattern == "forward_regex" else -1 - loc = SimpleLocation(match.start(), match.end(), strand) - loc = shift_location(loc, 0, len(seq)) - out[site].append(loc) - return out + + mode = "greedy" if greedy else "conservative" + collection = RecombinaseCollection( + recombinase_dict["BP"][mode].recombinases + + recombinase_dict["LR"][mode].recombinases + ) + return collection.find(seq) def annotate_gateway_sites(seq: Dseqrecord, greedy: bool) -> Dseqrecord: - sites = find_gateway_sites(seq, greedy) - for site in sites: - for loc in sites[site]: - seq.features.append( - SeqFeature(loc, type="protein_bind", qualifiers={"label": [site]}) - ) - return seq + """Annotate gateway sites in a sequence.""" + mode = "greedy" if greedy else "conservative" + collection = RecombinaseCollection( + recombinase_dict["BP"][mode].recombinases + + recombinase_dict["LR"][mode].recombinases + ) + return collection.annotate(seq) diff --git a/src/pydna/opencloning_models.py b/src/pydna/opencloning_models.py index 1353a763..9efc13ab 100644 --- a/src/pydna/opencloning_models.py +++ b/src/pydna/opencloning_models.py @@ -524,6 +524,10 @@ class CreLoxRecombinationSource(AssemblySource): ) +class RecombinaseSource(AssemblySource): + pass + + class PCRSource(AssemblySource): TARGET_MODEL: ClassVar[Type[_PCRSource]] = _PCRSource add_primer_features: bool = Field(default=False) diff --git a/src/pydna/recombinase.py b/src/pydna/recombinase.py new file mode 100644 index 00000000..7759da94 --- /dev/null +++ b/src/pydna/recombinase.py @@ -0,0 +1,389 @@ +# -*- coding: utf-8 -*- +""" +Generic site-specific recombinase functionality. + +This module provides tools for simulating site-specific recombination reactions +mediated by recombinases (e.g. phage integrases such as phiC31, Bxb1, etc.). + +Recombinase sites are specified as strings where the **lowercase** portion +indicates the homology/overlap core shared between the two sites. The +uppercase portion represents the flanking recognition arms. + +For example:: + + >>> site1 = "ATGCCCTAAaaTT" + >>> site2 = "AAaaTTTTTTTCCCT" + +The lowercase ``aa`` is the overlap that will appear in the assembled product. + +Sites may contain IUPAC degenerate bases (N, W, S, etc.) and the search +handles both linear and circular sequences. + +The easiest way to use it is with the recombinase_integration and +recombinase_excision functions from the assembly2 module. + +Integration and excision with a single recombinase:: + + >>> from pydna.dseqrecord import Dseqrecord + >>> from pydna.recombinase import Recombinase + >>> from pydna.assembly2 import recombinase_integration, recombinase_excision + >>> site1 = "ATGCCCTAAaaCT" + >>> site2 = "CAaaTTTTTTTCCCT" + >>> genome = Dseqrecord("ccccccATGCCCTAAAACTaaaaa") + >>> insert = Dseqrecord("CAAATTTTTTTCCCTbbbbb", circular=True) + >>> rec = Recombinase(site1, site2) + >>> products = recombinase_integration(genome, [insert], rec) + >>> + >>> # Cre-Lox style (same site on both sides): integrate then excise returns originals + >>> site = "ATGaaGTA" + >>> genome = Dseqrecord("ccccccATGAAGTAAAAA") + >>> insert = Dseqrecord("ATGAAGTABbbbb", circular=True) + >>> rec = Recombinase(site, site) + >>> integrated = recombinase_integration(genome, [insert], rec)[0] + >>> excised = recombinase_excision(integrated, rec) + +Find and annotate sites in a sequence:: + + >>> from pydna.dseqrecord import Dseqrecord + >>> from pydna.recombinase import Recombinase + >>> site1, site2 = "ATGCCCTAAaaTT", "AAaaTTTTTTTCCCT" + >>> rec = Recombinase(site1, site2, site1_name="mysite1", site2_name="mysite2") + >>> seq = Dseqrecord("gggATGCCCTAAaaTTttt") + >>> sites = rec.find(seq) + >>> annotated = rec.annotate(seq) + +When several sites are possible, or when using multiple recombinases, you can +create a RecombinaseCollection. It has the methods overlap, find, and annotate, +so you can use it as a single Recombinase. For an example, check the gateway module:: + + >>> from pydna.dseqrecord import Dseqrecord + >>> from pydna.recombinase import Recombinase, RecombinaseCollection + >>> rec1 = Recombinase("AAaaTTC", "CCaaTTC", site1_name="s1", site2_name="s2") + >>> rec2 = Recombinase("GAccACC", "TCccAAC", site1_name="s3", site2_name="s4") + >>> collection = RecombinaseCollection([rec1, rec2]) + >>> seq = Dseqrecord("gggAAAATTCTtttGACCACCTttt") + >>> sites = collection.find(seq) + >>> annotated = collection.annotate(seq) + +IUPAC degenerate bases work (e.g. N matches any base):: + + >>> site1 = "ATGNNNaaTT" + >>> site2 = "CCNNaaTTGG" + >>> rec = Recombinase(site1, site2) + +If the recombinase reaction is reversible, you can get the reverse recombinase with the get_reverse_recombinase method. + + >>> rec = Recombinase('AAAcgCG', 'TTAcgATGA', site1_name="s1", site2_name="s2") + >>> rec + Recombinase(site1=AAAcgCG, site2=TTAcgATGA, site1_name=s1, site2_name=s2) + >>> rec.get_reverse_recombinase(site12_name="s1+s2", site21_name="s2+s1") + Recombinase(site1=AAAcgATGA, site2=TTAcgCG, site1_name=s1+s2, site2_name=s2+s1) + +Using a Recombinase as Assembly algorithm:: + + >>> from pydna.dseqrecord import Dseqrecord + >>> from pydna.recombinase import Recombinase + >>> from pydna.assembly2 import Assembly + >>> site1 = "ATGCCCTAAaaTT" + >>> site2 = "AAaaTTTTTTTCCCT" + >>> seqA = Dseqrecord("aaaATGCCCTAAaaTTtt") + >>> seqB = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") + >>> rec = Recombinase(site1, site2) + >>> asm = Assembly([seqA, seqB], algorithm=rec.overlap, use_fragment_order=False) + >>> products = asm.assemble_linear() + +""" + +import re +import itertools +import copy + +from Bio.Seq import reverse_complement +from Bio.SeqFeature import SimpleLocation, SeqFeature + +from pydna.dseqrecord import Dseqrecord +from pydna.utils import shift_location +from pydna.sequence_regex import compute_regex_site, dseqrecord_finditer +from pydna.types import SequenceOverlap + + +def _recombinase_homology_offset_and_length(site: str) -> tuple[int, int]: + """ + Return (offset, length) of the lowercase homology region inside a + recombinase recognition site string. + + The lowercase segment represents the shared homology between the two + recombinase sites. + + Parameters + ---------- + site : str + The recognition site sequence, with the homology region in lowercase. + Must contain only letters (no digits or symbols). + + Returns + ------- + offset : int + Index of the first lowercase character in the site. + length : int + Length of the contiguous lowercase homology region. + + Raises + ------ + ValueError + If the site contains invalid characters or no lowercase region. + """ + if not re.fullmatch(r"[A-Z]+[a-z]+[A-Z]+", site): + raise ValueError( + "Recombinase recognition site is not in the expected format." + "Expected format: [A-Z]+[a-z]+[A-Z]+, e.g. 'ATGCCCTAAaaTT'" + ) + match = re.search(r"[a-z]+", site) + return match.start(), len(match.group()) + + +class Recombinase: + """A site-specific recombinase defined by two recognition sites. Homology cores must match. + + Parameters + ---------- + site1 : str + The first recognition site. The homology core must be in lowercase. + site2 : str + The second recognition site. The homology core must be in lowercase. + site1_name : str, optional + Label for site1 in find/annotate output. Default is "site1". + site2_name : str, optional + Label for site2 in find/annotate output. Default is "site2". + + Examples + -------- + >>> from pydna.dseqrecord import Dseqrecord + >>> from pydna.recombinase import Recombinase + >>> rec = Recombinase("ATGCCCTAAaaTT", "AAaaTTTTTTTCCCT") + >>> seqA = Dseqrecord("aaaATGCCCTAAaaTTtt") + >>> seqB = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") + >>> _ = rec.overlap(seqA, seqB) + >>> sites = rec.find(seqA) + >>> _ = rec.annotate(seqA) + """ + + def __init__( + self, + site1: str, + site2: str, + site1_name: str = "site1", + site2_name: str = "site2", + ): + self.site1 = site1 + self.site2 = site2 + self.site1_name = site1_name + self.site2_name = site2_name + + off1, len1 = _recombinase_homology_offset_and_length(site1) + off2, len2 = _recombinase_homology_offset_and_length(site2) + if site1[off1 : off1 + len1] != site2[off2 : off2 + len2]: + raise ValueError( + "Recombinase recognition sites do not have matching homology cores." + f"Expected {site1[off1:off1 + len1]} == {site2[off2:off2 + len2]}" + ) + self._homology_len = len1 + + rc1 = reverse_complement(site1) + rc2 = reverse_complement(site2) + off1_rev, _ = _recombinase_homology_offset_and_length(rc1) + off2_rev, _ = _recombinase_homology_offset_and_length(rc2) + + self._site1_fwd = compute_regex_site(site1) + self._site1_rev = compute_regex_site(rc1) + self._site2_fwd = compute_regex_site(site2) + self._site2_rev = compute_regex_site(rc2) + + self._configs = [ + (self._site1_fwd, self._site2_fwd, off1, off2), + (self._site1_rev, self._site2_rev, off1_rev, off2_rev), + (self._site2_fwd, self._site1_fwd, off2, off1), + (self._site2_rev, self._site1_rev, off2_rev, off1_rev), + ] + + def __repr__(self) -> str: + return f"Recombinase(site1={self.site1}, site2={self.site2}, site1_name={self.site1_name}, site2_name={self.site2_name})" + + def get_reverse_recombinase( + self, site12_name: str = "site12", site21_name: str = "site21" + ) -> "Recombinase": + """Get a recombinase that does the reverse reaction. + + Parameters + ---------- + site12_name : str, optional + Label for site12 in find/annotate output. Default is "site12". + site21_name : str, optional + Label for site21 in find/annotate output. Default is "site21". + + Returns + ------- + Recombinase + A recombinase that does the reverse reaction. + + Examples + -------- + >>> from pydna.dseqrecord import Dseqrecord + >>> from pydna.recombinase import Recombinase + >>> rec = Recombinase("ATGCCCTAAaaTT", "AAaaTTTTTTTCCCT") + >>> rec.get_reverse_recombinase() + Recombinase(site1=ATGCCCTAAaaTTTTTTTCCCT, site2=AAaaTT, site1_name=site12, site2_name=site21) + """ + _, _, off1, off2 = self._configs[0] + site12 = ( + self.site1[: off1 + self._homology_len] + + self.site2[off2 + self._homology_len :] + ) + site21 = ( + self.site2[: off2 + self._homology_len] + + self.site1[off1 + self._homology_len :] + ) + return Recombinase(site12, site21, site12_name, site21_name) + + def overlap( + self, + seqx: Dseqrecord, + seqy: Dseqrecord, + limit: None = None, + ) -> list[SequenceOverlap]: + """Find overlaps between *seqx* and *seqy* mediated by the sites. + + Parameters + ---------- + seqx : Dseqrecord + The first sequence. + seqy : Dseqrecord + The second sequence. + limit : None + Not used. Present only to satisfy the + ``AssemblyAlgorithmType`` convention. + + Returns + ------- + list[SequenceOverlap] + A list of ``(start_in_x, start_in_y, length)`` tuples. + """ + matches: list[SequenceOverlap] = [] + + for site_x_re, site_y_re, off_x, off_y in self._configs: + matches_x = list(dseqrecord_finditer(site_x_re, seqx)) + if not matches_x: + continue + matches_y = list(dseqrecord_finditer(site_y_re, seqy)) + if not matches_y: + continue + + for mx, my in itertools.product(matches_x, matches_y): + # Here we wrap in case the sequence is circular and the match spans the origin, + # and the offset makes the match go beyond the origin. + matches.append( + ( + (mx.start() + off_x) % len(seqx), + (my.start() + off_y) % len(seqy), + self._homology_len, + ) + ) + + # Deduplicate while preserving order + seen: set[SequenceOverlap] = set() + unique: list[SequenceOverlap] = [] + for m in matches: + if m not in seen: + seen.add(m) + unique.append(m) + return unique + + def find(self, seq: Dseqrecord) -> dict[str, list[SimpleLocation]]: + """Find all occurrences of the recombinase sites in *seq*. + + Parameters + ---------- + seq : Dseqrecord + Sequence to search. + + Returns + ------- + dict[str, list[SimpleLocation]] + Dictionary mapping site names to lists of locations. + """ + out: dict[str, list[SimpleLocation]] = {} + for name, raw_site in [ + (self.site1_name, self.site1), + (self.site2_name, self.site2), + ]: + fwd_re = compute_regex_site(raw_site) + rev_re = compute_regex_site(reverse_complement(raw_site)) + for pattern, strand in [(fwd_re, 1), (rev_re, -1)]: + for match in dseqrecord_finditer(pattern, seq): + loc = SimpleLocation(match.start(), match.end(), strand) + loc = shift_location(loc, 0, len(seq)) + out.setdefault(name, []).append(loc) + return out + + def annotate(self, seq: Dseqrecord) -> Dseqrecord: + """Annotate *seq* with features for all occurrences of the recombinase sites. + + Parameters + ---------- + seq : Dseqrecord + Sequence to annotate. + + Returns + ------- + Dseqrecord + A copy of the sequence with added features. + """ + out_seq = copy.deepcopy(seq) + sites = self.find(out_seq) + for name, locs in sites.items(): + for loc in locs: + if not any( + f.location == loc + and f.type == "protein_bind" + and f.qualifiers.get("label", []) == [name] + for f in out_seq.features + ): + out_seq.features.append( + SeqFeature( + loc, type="protein_bind", qualifiers={"label": [name]} + ) + ) + return out_seq + + +class RecombinaseCollection: + """A collection of recombinases.""" + + def __init__(self, recombinases: list[Recombinase]): + if not isinstance(recombinases, list): + raise ValueError("recombinases must be a list of Recombinase objects") + if not all(isinstance(r, Recombinase) for r in recombinases): + raise ValueError("recombinases must be a list of Recombinase objects") + if len(recombinases) == 0: + raise ValueError("recombinases must be a non-empty list") + self.recombinases = recombinases + + def overlap(self, seqx: Dseqrecord, seqy: Dseqrecord) -> list[SequenceOverlap]: + """Find overlaps between *seqx* and *seqy* mediated by the recombinases.""" + return sum((r.overlap(seqx, seqy) for r in self.recombinases), []) + + def find(self, seq: Dseqrecord) -> dict[str, list[SimpleLocation]]: + """Find all occurrences of the recombinase sites in *seq*.""" + out = dict() + for rec in self.recombinases: + rec_sites = rec.find(seq) + for k, v in rec_sites.items(): + out.setdefault(k, []).extend(v) + return out + + def annotate(self, seq: Dseqrecord) -> Dseqrecord: + """Annotate *seq* with features for all occurrences of the recombinase sites.""" + out = seq + for rec in self.recombinases: + out = rec.annotate(out) + return out diff --git a/tests/test_module_assembly2.py b/tests/test_module_assembly2.py index 19ee9084..8f1ca2d7 100644 --- a/tests/test_module_assembly2.py +++ b/tests/test_module_assembly2.py @@ -2638,7 +2638,7 @@ def test_gateway_assembly(): # Test that greedy is being used (finds) with pytest.raises(ValueError) as e: assembly.gateway_assembly(products_LR, "LR", greedy=True) - assert "fragment 2: attB1, attL1, attR1, attP1" in str(e.value) + assert "fragment 2: attB1, attP1, attL1, attR1" in str(e.value) # Test multi site only seq1 = Dseqrecord("aaa" + attB1 + "ggg" + attB2 + "ccc", circular=True) diff --git a/tests/test_module_gateway.py b/tests/test_module_gateway.py index 3d24a3b6..402a15d7 100644 --- a/tests/test_module_gateway.py +++ b/tests/test_module_gateway.py @@ -1,4 +1,5 @@ -from pydna.gateway import gateway_overlap, find_gateway_sites +# -*- coding: utf-8 -*- +from pydna.gateway import annotate_gateway_sites, gateway_overlap, find_gateway_sites import pydna.assembly2 as assembly import glob from pydna.parsers import parse_snapgene @@ -119,3 +120,9 @@ def test_find_gateway_sites(): assert find_gateway_sites(seq, greedy=False) == { "attB3": [SimpleLocation(0, 21, 1), SimpleLocation(24, 45, 1)] } + + +def test_annotate_gateway_sites(): + seq = Dseqrecord("CAACTTTGTATACAAAAGTTGaaaCAACTTTGTATAATAAAGTTG") + annotated_seq = annotate_gateway_sites(seq, greedy=False) + assert len(annotated_seq.features) == 2 diff --git a/tests/test_module_recombinase.py b/tests/test_module_recombinase.py new file mode 100644 index 00000000..b2984bb5 --- /dev/null +++ b/tests/test_module_recombinase.py @@ -0,0 +1,354 @@ +# -*- coding: utf-8 -*- +import pytest +from Bio.SeqFeature import SimpleLocation +from pydna.dseqrecord import Dseqrecord +from pydna.dseq import Dseq +from pydna.assembly2 import ( + Assembly, + recombinase_integration, + recombinase_excision, +) +from pydna.recombinase import ( + _recombinase_homology_offset_and_length, + Recombinase, + RecombinaseCollection, +) + + +# --------------------------------------------------------------------------- +# _recombinase_homology_offset_and_length +# --------------------------------------------------------------------------- + + +def test_recombinase_homology_offset_and_length_basic(): + site1 = "ATGCCCTAAaaTT" # lowercase 'aa' at positions 9-10 + site2 = "AAaaTTTTTTTCCCT" # lowercase 'aa' at positions 2-3 + + off1, len1 = _recombinase_homology_offset_and_length(site1) + off2, len2 = _recombinase_homology_offset_and_length(site2) + + assert (off1, len1) == (9, 2) + assert (off2, len2) == (2, 2) + + +def test_recombinase_homology_offset_and_length_wrong_pattern_raises(): + for pattern in ["ATGCCCTAAATT", "ATGaa", "aaATG", "ATGaaCCaa", "ATGaaCCaaTT"]: + with pytest.raises(ValueError): + _recombinase_homology_offset_and_length(pattern) + + +def test_recombinase_homology_offset_and_length_invalid_chars(): + with pytest.raises(ValueError): + _recombinase_homology_offset_and_length("ATGaa123") + with pytest.raises(ValueError): + _recombinase_homology_offset_and_length("ATGaa-CC") + + +# --------------------------------------------------------------------------- +# Recombinase.overlap – basic (linear, exact) +# --------------------------------------------------------------------------- + + +def test_recombinase_init_errors(): + # Different cores + with pytest.raises(ValueError) as e: + Recombinase("ATGacCTAAATT", "AAaaTTTTTTTCCCT") + assert "Recombinase recognition sites do not have matching" in str(e.value) + + +def test_recombinase_overlap_single_match(): + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + + seqA = Dseqrecord("aaaATGCCCTAAaaTTtt") + seqB = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") + + rec = Recombinase(site1, site2) + matches = rec.overlap(seqA, seqB, limit=None) + assert matches == [(12, 6, 2)] + + +def test_recombinase_overlap_no_site_returns_empty(): + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + + seqA = Dseqrecord("aaaATGCCCTAAaaTTtt") + seqB = Dseqrecord("tataAAccTTTTTTTCCCTaaa") + + rec = Recombinase(site1, site2) + matches = rec.overlap(seqA, seqB, limit=None) + assert matches == [] + + +def test_recombinase_algorithm_with_assembly(): + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + + seqA = Dseqrecord("aaaATGCCCTAAaaTTtt") + seqB = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") + + rec = Recombinase(site1, site2) + asm = Assembly([seqA, seqB], algorithm=rec.overlap, use_fragment_order=False) + products = asm.assemble_linear() + + assert len(products) == 2 + assert str(products[0].seq) == "aaaATGCCCTAAaaTTTTTTTCCCTaaa" + assert str(products[1].seq) == "tataAAaaTTtt" + + +# --------------------------------------------------------------------------- +# Circular sequences +# --------------------------------------------------------------------------- + + +def test_circular_site_spanning_origin(): + """Sites that span the origin of a circular sequence should be found.""" + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + + circ_seq = Dseqrecord("aaTTgggggATGCCCTAA", circular=True) + linear_seq = Dseqrecord("tataAAaaTTTTTTTCCCTaaa") + + rec = Recombinase(site1, site2) + for shift in range(len(circ_seq)): + circ_seq_shifted = circ_seq.shifted(shift) + matches = rec.overlap(circ_seq_shifted, linear_seq, limit=None) + assert len(matches) == 1 + start, _, length = matches[0] + assert ( + str(circ_seq_shifted.seq[start : (start + length) % len(circ_seq_shifted)]) + == "aa" + ) + + +def test_circular_both_sequences(): + """Both sequences circular, sites should still be found.""" + site1 = "GGGaaaCCC" + site2 = "TTaaaTT" + + circ_x = Dseqrecord("aaaCCCcaGGG", circular=True) + circ_y = Dseqrecord("aaaTTgaTT", circular=True) + + rec = Recombinase(site1, site2) + matches = rec.overlap(circ_x, circ_y, limit=None) + assert len(matches) == 1 + assert matches[0] == (0, 0, 3) + + +# --------------------------------------------------------------------------- +# Degenerate sequences +# --------------------------------------------------------------------------- + + +def test_degenerate_site(): + """Sites with IUPAC ambiguity codes should match concrete sequences.""" + site1 = "ATGNNNaaTT" + site2 = "CCNNaaTTGG" + + seqA = Dseqrecord("xxxATGCCCaaTTyyy") + seqB = Dseqrecord("xxxCCGGaaTTGGyyy") + + rec = Recombinase(site1, site2) + matches = rec.overlap(seqA, seqB, limit=None) + assert len(matches) == 1 + assert matches[0] == (9, 7, 2) + + +# --------------------------------------------------------------------------- +# Reverse complement +# --------------------------------------------------------------------------- + +# TODO: check tests below + + +def test_reverse_complement_sites(): + """Sites on the reverse strand should be detected.""" + from Bio.Seq import reverse_complement + + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + + rc_site1 = reverse_complement(site1) + rc_site2 = reverse_complement(site2) + seqA = Dseqrecord(f"ggg{rc_site1}ggg") + seqB = Dseqrecord(f"ttt{rc_site2}ttt") + + rec = Recombinase(site1, site2) + matches = rec.overlap(seqA, seqB, limit=None) + assert len(matches) == 1 + assert matches[0] == (5, 14, 2) + + +# --------------------------------------------------------------------------- +# Site annotation +# --------------------------------------------------------------------------- + + +def test_find_recombinase_sites(): + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + rec = Recombinase(site1, site2, site1_name="s1", site2_name="s2") + seq = Dseqrecord(f"ggg{site1.upper()}ttt") + sites = rec.find(seq) + assert "s1" in sites + assert sites["s1"] == [SimpleLocation(3, 3 + len(site1), 1)] + + # Works origin-spanning sites + seq_looped = seq.looped() + for shift in range(len(seq_looped)): + seq_looped_shifted = seq_looped.shifted(shift) + sites = rec.find(seq_looped_shifted) + assert str(sites["s1"][0].extract(seq_looped_shifted.seq)) == site1.upper() + + # Works with multiple sites + seq = Dseqrecord(f"ggg{site1.upper()}ttt{site2.upper()}ttt{site1.upper()}ttt") + sites = rec.find(seq) + assert len(sites["s1"]) == 2 + assert len(sites["s2"]) == 1 + + +def test_annotate_recombinase_sites(): + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + rec = Recombinase(site1, site2, site1_name="mysite1", site2_name="mysite2") + seq = Dseqrecord(f"ggg{site1.upper()}ttt") + + annotated_seq = rec.annotate(seq) + assert len(annotated_seq.features) == 1 + assert annotated_seq.features[0].qualifiers.get("label", []) == ["mysite1"] + assert annotated_seq.features[0].location == SimpleLocation(3, 3 + len(site1), 1) + + # Does not change original sequence + assert len(seq.features) == 0 + + +# --------------------------------------------------------------------------- +# recombinase_integration and recombinase_excision +# --------------------------------------------------------------------------- + + +def test_recombinase_integration(): + """Integration of insert into genome via recombinase sites.""" + site1 = "ATGCCCTAAaaCT" + site2 = "CAaaTTTTTTTCCCT" + + genome = Dseqrecord(f"cccccc{site1.upper()}aaaaa") + insert = Dseqrecord(f"{site2.upper()}bbbbb", circular=True) + rec = Recombinase(site1, site2) + + products = recombinase_integration(genome, [insert], rec) + + assert len(products) == 1 + assert len(products[0].features) == 0 + assert str(products[0].seq) == "ccccccATGCCCTAAAATTTTTTTCCCTbbbbbCAAACTaaaaa" + + +def test_recombinase_excision(): + """Excision of plasmid from genome with two recombinase sites.""" + site1 = "ATGCCCTAAaaCT" + site2 = "AAaaTTTTTTTCCCT" + + rec = Recombinase(site1, site2) + genome = Dseqrecord(f"cccccc{site1.upper()}tttt{site2.upper()}aaaaa") + + products = recombinase_excision(genome, rec) + assert len(products) == 2 + assert products[0].seq.upper() == Dseq("AACTttttAA".upper()) + assert ( + products[1].seq.upper() + == Dseq("ccccccATGCCCTAAAATTTTTTTCCCTaaaaa", circular=True).upper() + ) + + +def test_recombinase_integration_excision_reversibility(): + """With same site on both sides (Cre-Lox style), integrate then excise returns originals.""" + site = "ATGaaGTA" + + genome = Dseqrecord(f"cccccc{site.upper()}aaaaa") + insert = Dseqrecord(f"{site.upper()}bbbbb", circular=True) + rec = Recombinase(site, site) + + products = recombinase_integration(genome, [insert], rec) + assert len(products) == 1 + integrated = products[0] + + excised = recombinase_excision(integrated, rec) + assert len(excised) == 2 + + assert excised[1].seq.seguid() == genome.seq.seguid() + assert excised[0].seq.seguid() == insert.seq.seguid() + + +def test_recombinase_reverse(): + site1 = "ATGCCCTAAaaTT" + site2 = "AAaaTTTTTTTCCCT" + rec = Recombinase(site1, site2) + rev_rec = rec.get_reverse_recombinase() + assert rev_rec.site1 == "ATGCCCTAAaaTTTTTTTCCCT" + assert rev_rec.site2 == "AAaaTT" + assert rev_rec.site1_name == "site12" + assert rev_rec.site2_name == "site21" + + +# --------------------------------------------------------------------------- +# RecombinaseCollection +# --------------------------------------------------------------------------- + + +def test_recombinase_collection(): + site1 = "AAaaTTC" + site2 = "CCaaTTC" + site3 = "GAccACC" + site4 = "TCccAAC" + rec1 = Recombinase(site1, site2, site1_name="s1", site2_name="s2") + rec2 = Recombinase(site3, site4, site1_name="s3", site2_name="s4") + collection = RecombinaseCollection([rec1, rec2]) + seq1 = Dseqrecord(f"ggg{site1.upper()}ttt{site3.upper()}ttt") + seq2 = Dseqrecord(f"gggc{site2.upper()}ttt{site4.upper()}ttt") + print(collection.overlap(seq1, seq2)) + assert collection.overlap(seq1, seq2) == [(5, 6, 2), (15, 16, 2)] + + +def test_recombinase_collection_find(): + # Here the important thing to test is that if + # two recombinases have the same site name, the find method should return + # both locations. + site1 = "AAaaTTC" + site2 = "CCaaTTC" + site3 = "GAccACC" + site4 = "TCccAAC" + rec1 = Recombinase(site1, site2, site1_name="s1", site2_name="s2") + rec2 = Recombinase(site3, site4, site1_name="s1", site2_name="s2") + collection = RecombinaseCollection([rec1, rec2]) + seq = Dseqrecord(f"ggg{site1.upper()}ttt{site3.upper()}ttt") + assert len(collection.find(seq)["s1"]) == 2 + + +def test_recombinase_collection_annotate(): + # Same as above, checking that two sites with the same name are annotated. + site1 = "AAaaTTC" + site2 = "CCaaTTC" + site3 = "GAccACC" + site4 = "TCccAAC" + rec1 = Recombinase(site1, site2, site1_name="s1", site2_name="s2") + rec2 = Recombinase(site3, site4, site1_name="s1", site2_name="s2") + collection = RecombinaseCollection([rec1, rec2]) + seq = Dseqrecord(f"ggg{site1.upper()}ttt{site3.upper()}ttt") + annotated_seq = collection.annotate(seq) + assert len(annotated_seq.features) == 2 + assert annotated_seq.features[0].qualifiers.get("label", []) == ["s1"] + assert annotated_seq.features[1].qualifiers.get("label", []) == ["s1"] + + # It does not re-annotate if the sequence is already annotated. + annotated_seq = collection.annotate(annotated_seq) + assert len(annotated_seq.features) == 2 + + +def test_recombinase_collection_init_errors(): + # Check that the init method raises errors for invalid inputs. + with pytest.raises(ValueError): + RecombinaseCollection(None) + with pytest.raises(ValueError): + RecombinaseCollection([]) + with pytest.raises(ValueError): + RecombinaseCollection([Recombinase("AAaaTTC", "CCaaTTC"), "blah"])