From 6a30420308da617afaecfa71b3e4dac72f2449a5 Mon Sep 17 00:00:00 2001 From: LCrossman Date: Tue, 9 Dec 2025 11:49:45 +0000 Subject: [PATCH 1/2] kh3rld regex speedup commit --- microBioRust/new_output_embl.gbk | 52 +++++++++++++++++++++++++++++++ microBioRust/new_output_embl.gff | 21 +++++++++++++ microBioRust/test_output.gff | 6 ++++ microBioRust/test_output_embl.gff | 8 +++++ 4 files changed, 87 insertions(+) create mode 100644 microBioRust/new_output_embl.gbk create mode 100644 microBioRust/new_output_embl.gff create mode 100644 microBioRust/test_output.gff create mode 100644 microBioRust/test_output_embl.gff diff --git a/microBioRust/new_output_embl.gbk b/microBioRust/new_output_embl.gbk new file mode 100644 index 0000000..ab66c41 --- /dev/null +++ b/microBioRust/new_output_embl.gbk @@ -0,0 +1,52 @@ +LOCUS source_1 928 bp DNA linear CON 24-NOV-2025 +DEFINITION Escherichia coli K-12 substr. MG1655. +ACCESSION source_1 +KEYWORDS . +SOURCE Escherichia coli K-12 substr. MG1655 + ORGANISM Escherichia coli K-12 substr. MG1655 +FEATURES Location/Qualifiers + source 1..910 + /organism="K-12 substr. MG1655" + /mol_type="DNA" + /strain="K-12 substr. MG1655" + /db_xref="PRJNA57779" + gene complement(1..354) + /locus_tag="b3304" + CDS complement(1..354) + /locus_tag="b3304" + /codon_start="1" + /gene="rplR" + /translation="MDKKSARIRRATRARRKLQELGATRLVVHRTPRHIYAQVIAPNGS + LVAASTVEKAIAEQLKYTGNKDAAAAVGKAVAERALEKGIKDVSFDRSGFQYHGRVQAL + DAAREAGLQ" + /product="50S ribosomal subunit protein L18" + gene complement(364..897) + /locus_tag="b3305" + CDS complement(364..897) + /locus_tag="b3305" + /codon_start="1" + /gene="rplF" + /translation="MSRVAKAPVVVPAGVDVKINGQVITIKGKNGELTRTLNDAVEVKH + NTLTFGPRDGYADGWAQAGTARALLNSMVIGVTEGFTKKLQLVGVGYRAAVKGNVINLS + GFSHPVDHQLPAGITAECPTQTEIVLKGADKQVIGQVAADLRAYRRPEPYKGKGVRYAD + VVRTKEAKK" + /product="50S ribosomal subunit protein L6" +ORIGIN + 1 acctctacct tagaactgaa ggccagcttc acgggcagca tctgccagtg cctggacacg + 61 accatgatat tggaacccgg aacggtcaaa ggatacatct ttgatgcctt tttccagagc + 121 gcgttcagcg acagctttac ccacagctgc agccgcgtct ttgttaccgg tgtacttcag + 181 ttgttcagcg atagcttttt ctacagtaga agcagctacc agaacttcag aaccgttcgg + 241 tgcaattacc tgtgcgtaaa tgtgacgcgg ggtacgatgt accaccaggc gagttgcgcc + 301 cagctcctgg agcttgcggc gtgcgcgggt cgcacgacgg atacgagcag atttcttatc + 361 catagtgtta ccttacttct tcttagcctc tttggtacgc acgacttcgt cggcgtaacg + 421 aacacccttg cctttataag gctcaggacg acggtaggcg cgcagatccg ctgcaacctg + 481 gccgatcacc tgcttatcag cgcctttcag cacgatttca gtctgagtcg gacattcagc + 541 agtgataccc gcaggcagct gatggtcaac aggatgagag aaacccagag acaggttaat + 601 cacattgcct ttaaccgctg cacggtaacc tacaccaacc agctgcagct tcttagtgaa + 661 gccttcggta acaccgataa ccattgagtt cagcagggca cgcgcggtac cagcctgtgc + 721 ccaaccgtct gcgtaaccat cacgcggacc gaaggtcagg gtattatctg catgtttaac + 781 ttcaacagca tcgttgagag tacgagtcag ctcgccgttt ttacctttga tcgtaataac + 841 ctgaccgttg atttttacgt caacgccggc aggaacaacg accggtgctt tagcaacacg + 901 agacattttt tcc + +// diff --git a/microBioRust/new_output_embl.gff b/microBioRust/new_output_embl.gff new file mode 100644 index 0000000..250c63b --- /dev/null +++ b/microBioRust/new_output_embl.gff @@ -0,0 +1,21 @@ +##gff-version 3 +##sequence-region source_1 1 910 +source_1 . CDS 1 354 0 - 0 id=b3304;name=source_1;gene=rplR;locus_tag=b3304;product=50S ribosomal subunit protein L18 +source_1 . CDS 364 897 0 - 0 id=b3305;name=source_1;gene=rplF;locus_tag=b3305;product=50S ribosomal subunit protein L6 +##FASTA +acctctaccttagaactgaaggccagcttcacgggcagcatctgccagtgcctggacacg +accatgatattggaacccggaacggtcaaaggatacatctttgatgcctttttccagagc +gcgttcagcgacagctttacccacagctgcagccgcgtctttgttaccggtgtacttcag +ttgttcagcgatagctttttctacagtagaagcagctaccagaacttcagaaccgttcgg +tgcaattacctgtgcgtaaatgtgacgcggggtacgatgtaccaccaggcgagttgcgcc +cagctcctggagcttgcggcgtgcgcgggtcgcacgacggatacgagcagatttcttatc +catagtgttaccttacttcttcttagcctctttggtacgcacgacttcgtcggcgtaacg +aacacccttgcctttataaggctcaggacgacggtaggcgcgcagatccgctgcaacctg +gccgatcacctgcttatcagcgcctttcagcacgatttcagtctgagtcggacattcagc +agtgatacccgcaggcagctgatggtcaacaggatgagagaaacccagagacaggttaat +cacattgcctttaaccgctgcacggtaacctacaccaaccagctgcagcttcttagtgaa +gccttcggtaacaccgataaccattgagttcagcagggcacgcgcggtaccagcctgtgc +ccaaccgtctgcgtaaccatcacgcggaccgaaggtcagggtattatctgcatgtttaac +ttcaacagcatcgttgagagtacgagtcagctcgccgtttttacctttgatcgtaataac +ctgaccgttgatttttacgtcaacgccggcaggaacaacgaccggtgctttagcaacacg +agacattttttcc diff --git a/microBioRust/test_output.gff b/microBioRust/test_output.gff new file mode 100644 index 0000000..f155761 --- /dev/null +++ b/microBioRust/test_output.gff @@ -0,0 +1,6 @@ +##gff-version 3 +##sequence-region source_NC_000913_1 1 913 +source_NC_000913_1 . CDS 10 363 0 - 0 id=b3304;name=source_NC_000913_1;gene=rplR;locus_tag=b3304;product=50S ribosomal subunit protein L18 +source_NC_000913_1 . CDS 373 906 0 - 0 id=b3305;name=source_NC_000913_1;gene=rplF;locus_tag=b3305;product=50S ribosomal subunit protein L6 +##FASTA +acctctaccttagaactgaaggccagcttcacgggcagcatctgccagtgcctggacacgaccatgatattggaacccggaacggtcaaaggatacatctttgatgcctttttccagagcgcgttcagcgacagctttacccacagctgcagccgcgtctttgttaccggtgtacttcagttgttcagcgatagctttttctacagtagaagcagctaccagaacttcagaaccgttcggtgcaattacctgtgcgtaaatgtgacgcggggtacgatgtaccaccaggcgagttgcgcccagctcctggagcttgcggcgtgcgcgggtcgcacgacggatacgagcagatttcttatccatagtgttaccttacttcttcttagcctctttggtacgcacgacttcgtcggcgtaacgaacacccttgcctttataaggctcaggacgacggtaggcgcgcagatccgctgcaacctggccgatcacctgcttatcagcgcctttcagcacgatttcagtctgagtcggacattcagcagtgatacccgcaggcagctgatggtcaacaggatgagagaaacccagagacaggttaatcacattgcctttaaccgctgcacggtaacctacaccaaccagctgcagcttcttagtgaagccttcggtaacaccgataaccattgagttcagcagggcacgcgcggtaccagcctgtgcccaaccgtctgcgtaaccatcacgcggaccgaaggtcagggtattatctgcatgtttaacttcaacagcatcgttgagagtacgagtcagctcgccgtttttacctttgatcgtaataacctgaccgttgatttttacgtcaacgccggcaggaacaacgaccggtgctttagcaacacgagacattttttcc diff --git a/microBioRust/test_output_embl.gff b/microBioRust/test_output_embl.gff new file mode 100644 index 0000000..4c7ab49 --- /dev/null +++ b/microBioRust/test_output_embl.gff @@ -0,0 +1,8 @@ +##gff-version 3 +##sequence-region source_AM236082_1 1 6666 +source_AM236082_1 . CDS 1 1197 0 + 0 id=pRL80001;name=source_AM236082_1;gene=repAp8;locus_tag=pRL80001;product=replication protein RepA +source_AM236082_1 . CDS 1321 2280 0 + 0 id=pRL80002;name=source_AM236082_1;gene=repBp8;locus_tag=pRL80002;product=replication protein RepB +source_AM236082_1 . CDS 2455 3672 0 + 0 id=pRL80003;name=source_AM236082_1;gene=repCp8;locus_tag=pRL80003;product=replication RepC protein +source_AM236082_1 . CDS 3811 6666 0 + 0 id=pRL80004;name=source_AM236082_1;locus_tag=pRL80004;product=hypothetical protein +##FASTA +gtggagaatcccgctcagcttcagaaggctattcataaactgatagcggcccacgcgcgagatctctcgggcgcgcttcacgagcatcgtgtgaagctttatccgcctgaagctcgaaagacgcttcggtcattttcgtcgatagaggctgcgaagctcattggcgtcaacgatggctatctccgccatctttcgctcgagggtaaggggccgcagcctgagatcggaaataacaatcgccgttcgtattcggtcgagactattcaggcgctccgcgagtatctcgacgagaacggcaagggtgaccgtcggtactcaccacgccggagcggtcgtgagcatttgcaggttataaccgcagtgaacttcaagggaggcagcggtaagaccacgacggctgctcatcttgctcagtatcttgcgcttaatggataccgggttcttgcgattgatcttgatccgcaggccagcatgtccgctttgcacggattccagcctgagtttgacgttggcgacaacgaaacgctctacggcgccgttcgttatgatgaagagcggcgcccgctgaaggatataatcaagaaaacctactttgcgaaccttgatctcgttccgggcaacctcgagcttatggaattcgagcacgacaccgctaaagtgctcggctctaacgaccgcaagaacatcttcttcacgcgaatggatgacgcaatcgcgtcagtggcggacgactatgacgttgtcgtcgtcgactgccctccccagctcggctttctgacgatctcggctctatgcgcggcaaccgccgttcttgttactgtacatcctcagatgctcgatgtgatgtcgatgtgccagtttctgctgatgacctcagaacttctgagcgtcgttgcggatgctggcgggagcatgaactacgattggatgcgttatctcgttacgcgctacgagccgggagacggaccgcaaaaccagatggtgtcgttcatgcgcacgatgtttggcgaccatgtcctgaaccacccgatgctcaagagcacagccatttcagacgcggggattactaagcagactctctatgaggtgagccgcgaccagttcacgcgagcaacatacgaccgagccatggaatcgctcgacaacgtgaacagcgaaatcgaacaactcattcaatcatcttggggtcgcaaatgatggctctagagatctcagaaaacgcgacattgatggagaagttgccagccggaaacttttcggaatttgcactctctatgtcgaggaatccggcttgtcacgagtacctcaggggaaagcaagatggctagaaaacacctcctttcagatttgaaagctcctgcttcatcatctacggagttcgatgaagctagggctgcagacgtccctactccgcagtatgcgcctcgaggtgcaatcggtgccgtctcgcgatcgattgaagctttgaagtcgcagggactgagtgaactcgatcccgaactgatagatgcgccgtccgttactgatcgccttgatgaggatggggctcagtttgaggagttcgctcgcaacatccgtgagaatgggcagcaggttccgattcttgtccggcctcacccgaccgtggaaggacggtatcagattgcctacggccggagacggttgagagcggtcaaggcggccggcctcaaggtcaaagccgcaatcagaaatctgacagatgacgagcttgtactggcgcaaggtcaggaaaacagcgcgcgtcaggatctgtcgtttatcgagcgggcgctctatgcagcccagctcgaagcgagtggctaccagcgtcccgtcatcatggcagcgctggctgtcgacaaaagtaacctttcgcggttgattcaggctgcgacccaattgccggacgacgtcatccgactaattggtgctgcgcctaagaccggccgtgatcgctggtacgagctatcatcgcggttggctgcagaaggtgctgcggagaaggcgcgcgctcttctttcgactagcgaggttggctccctgggttctgatgagcgatttgttcgcgttttcgacgcggttgcgccgaagaaatctaagaaggaaaaagttcaggcggatgtctggcaagctgacgatggggtcaaggctgcgagtttccgccaggacaaacgaacactgacattgatgatcgacaagaaggcagcgccggaattcggtgagtacctgatgtcggctctccccgagatctacgcttcgttcaagaagtcgaagcaatagatgagtcgtaacgaagaaaggtgccgatagcgcaaagaaaaagccctccgaaacggtgttccagaaggcctctctcagtttggtcgcttagagaatcgcatttcccggaatcacagtcaagagtcaacgccacaccggcgtagccttttctttgccttgcgaaaggtgaaggacatggaaacgggttatatcacgacgccctttgggcggcggccgatgacgcttgctctggtgaagcgtcaggttaagaccgagcaggcaatagcggatggctcggtcgacaagtggcgcgtgtttcgcgacataagcgacgcccgctcacgccttggccttcaagatcgagccttggcggtcttgaatgcacttttaacattcttcccagttgctgaactcagcaatgagaggaacctggtcgtctttccatcaaatgctcagctatcagcccgcacaaacggtatcgctgggacaactctgcgcaagtgcctcggttcgctggtggaggccggtgtaatcatccgcaaggatagccctaacggtaagcgatatgctcgaaaaggcaaagaaggaaacatagaggacgcctacggcttcagtctggcaccgcttcttgcgcgcgccggcgagtttgctagcctcgcccaagacgtggctgctgaacagcgccgcttccgcatcacgaaagaccgcctcacgatcgttcggcgagatgtccgcaagctgatcaccgtcgggatggaagagaaccttgccggcgattggattgccgcggaaacgtgctttgtcgagattgtgggaaggttcgttcggcacccgacgctccaggacctgatttcgagcctcgacgagatgagccttcttcacgaagaagtctccaggatgctggaaattaaagaagaaaccgcaaaaagtgatggcaatgccatcccggacggatgccacatacagaattcaaataccgaatcctgccatgaacttgaaccccgctccgaaaagaagcagggcgaaaagtccgagccaaacaagaaaacggagcggaaagacgaaccggaagcgtttccgttgtccatggtgttgcgtgcctgcccggagatcaacgcatttggccctggtggatcgattggaagctggcgcgaaatgatgtcagcggcggtaacggttcggtccatgcttggcgtcagcccctctgcctatcaggaggcatgcgaggtgatggggcaggccggagcggcgatagcaatagcttgcatttaccagcgtggcgggcacatcaactcggcggggggatatcttcgggatctaacggggaaggcgcggcgaggggagttttcacttgggccaatgctgtttacgcaattgcgggcgaactcgggcaccgtcaaggcgtcagcgtaggtcaaagtatcatgattgtttagcctaaccggttgaactaattaacctattttgactagtttccggctggcaactttatctcgatctaaagcgtcgagtgaatggcagaagataatcttcctgatgggcgtccgtataatgaccgaaattgtgcttccgaccgaaaacacgatcatcgcggcagccaaaaaacttgacgcggccgcatcgcagctggtggcagagacgttctttgccattcggcatgggatgtcaatcaatccaattggtcgcaacccggatgggcagaccatcaagggataccctgacattactgggcgggtgccgggtgagaagaagtacctgatcgaagtcacgaaggacgactggcgcacacatcttcagagcgatctatcaaaactgtcccgcctgcagaaaggagcctacgcgggtttcctacttctctgcttccgaaagtccgagtccgaactcactcaaagcaacaggaagaaggcacgggaaaccgtccagcaggccgagagccggattgaaaagcttttgggtgtccaggcaggacaggtagaattcgtctttcttggcgagttcgcgcgtgaggtcagatcggcgaaataccaccgcgtattgctggctctgggtctcgagcttgtgccagcgccattctacacggatttgcgcttcgtgcagggcttagccgatttcgtaccgaccgctgaggaatatgaggctgagagtgttgttcctcgcgatgaggtaagccggacctatgagcgggtcttcaaaaacagactaacgttgatcgaaggcgagggcggtagcggcaaaacaagcctggccctagccgttgcgacggagcatcggaagcaaggcgagatctttctgttcttagacgcctctgtcgctgactggaagagcggttcggagcgagctcgcctcgttgacgtagcggcgatgttcgcggaatcgaatgtcctgattatattggacaacgtacatctgggcgatgcgtccggcatttctgaactgattacaaatgtccaggcgtccggttatgatttccgctttttgatgacgacgcgcagcagcgacgaagttgaacaatggaagcgcctgggaaatatcgagcttctccgcagagttccgtctggagccgatgtcaactctgcctatcaccgcctgctcactcaaaagtttcccggaagcagtttcaacgatattcccccagcggtgaccacacgatggtcaaatcaaattcccaatctggttattctcacgcttgctcttgaaggtctcacaaagagaggcggctatgatcgcgattgggcgatcaaggttgaggacgcaggcacataccttcaagctaagttcatctcgaagctgtcgtccgacgacgtcaaacaggtgggcaagatcgctgcgctctcacttctggaaattcccacctcgctcaggtcgctcgaccaccgggttccaaagtctgctgtggatctgggcttcgttcgtctgaactcgagttcaacaactcagcgatatgagctcgttcaccacgaactgggcaagctgatcacgtccttcaaagatccggatatcaaggcgcggctgggagaggtgatgtccgctgatcccttccaggcaacatatatcgggctgaagcttatcggaaacggagaagccagcctggcaaaggaattgttgtcgtcagtcctttctcaatcactcacactctcgccagatttctcgatgggaaactccggcggagtcttcggtatcctggtccagtccaacgtgactacctatcccgaaattgagcgtatccttcttcctgatatcggcgcctttttcgatacaaagccggatattgtaaccggccttagctccttcctcggggctgcctccgaaaacatggagcgcgtatacaatgccattgtggaaaaacttgccgaacaggaaacgattcgacggatcgaagagcttctcccatccgtcggcccgacgactttcgcgacactttaccgatgcgcgaactcacggaacctcccgtttctttcaacgcttcgaaaatatctcaacagagggaagcgtatagattcctttgcctatcgatgcaggtctgaaagtccgagtaaggtcgagatctgctggggcctgattgatgagttctttccacaccacaaggcccggtttgaagttgtgcttcgctctgccctcgccgagggatacatcgagcgccttatcccggaagagcttattgagtctcgctcttcaagggctgttcagacggcgatccgatgcgcaaatagcgaagttttcaaacggtacatcacgttccgtgactgcagcgacgcgacgctgttgcttctggcccacacgatgcacgacatgggcaggaatgatctctcggaggtcgcagctgaccgagttgcaggcaggacgacctcttcaatctggtatcatcgtcgcaccggtggcagggcgttgctgactattttgcggagagcatcgatatctgcagaaggagatgttcagaaaattctgatgcggcttgaggctgaaggaaaaatgagggccattgtgaatggaatgcggccttatcgcctagcgaattttattttcgtgatctgggatcggcacgagcaatttacttcattcatctcgaagacagatcttcaggaaattacaaaccgccggttcaaagcgcgagcggcagagttctctgaagagcgacaagcgtccatctacattgcaggaatctatgcgctggtaggcctcgacataccgcgggacgagtggagcgcggtcgacgtcactgaagacgatttcattggaaaccagaacaacccggtcttctggatcggtctcaaggctctggaagaaaatggcatgatacgccttgcccatcgaagcagatttccgacatctgtcgcggcgctagatactcattcggaaaacaccagccggatcatgaacgatttgaaaaactgggctgcgaccaggtaa From c1769d8937283be527d33982851fa8ce7b282872 Mon Sep 17 00:00:00 2001 From: LCrossman Date: Tue, 9 Dec 2025 11:55:52 +0000 Subject: [PATCH 2/2] kh3rld regex speedup commit --- microBioRust/src/gbk.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/microBioRust/src/gbk.rs b/microBioRust/src/gbk.rs index 4125a71..3438b97 100644 --- a/microBioRust/src/gbk.rs +++ b/microBioRust/src/gbk.rs @@ -1,6 +1,7 @@ //! # A Genbank to GFF parser //! //! +//! //! You are able to parse genbank and save as a GFF (gff3) format as well as extracting DNA sequences, gene DNA sequences (ffn) and protein fasta sequences (faa) //! //! You can also create new records and save as a genbank (gbk) format