@@ -632,24 +632,52 @@ def protein_sequences() -> Dict[str, str]:
632632 ),
633633 }
634634
635+ @staticmethod
636+ def proteins_for_pretraining () -> List [str ]:
637+ """
638+ Returns the IDs of the mock UniProt proteins that will be used for pretraining.
639+
640+ These are the proteins with:
641+ - No GO classes, or only invalid GO classes (missing the required evidence codes).
642+
643+ Returns:
644+ List[str]: IDs of the mock proteins that have no valid GO class.
645+ """
646+ return [
647+ "Swiss_Prot_5" , # No GO classes associated
648+ "Swiss_Prot_6" , # GO class with no evidence code
649+ "Swiss_Prot_7" , # GO class with invalid evidence code
650+ ]
651+
635652 @staticmethod
636653 def get_UniProt_raw_data () -> str :
637654 """
638655 Get raw data in string format for UniProt proteins.
639656
640- This mock data contains six Swiss-Prot proteins with different properties:
641- - Swiss_Prot_1 and Swiss_Prot_2 are valid proteins.
642- - Swiss_Prot_3 has a sequence length greater than 1002.
643- - Swiss_Prot_4 contains "X", a non-valid amino acid in its sequence.
644- - Swiss_Prot_5 has no GO IDs mapped to it.
645- - Swiss_Prot_6 has GO IDs mapped, but no evidence codes.
657+ This mock data contains eleven Swiss-Prot proteins with different properties:
658+ - **Swiss_Prot_1**: A valid protein with three valid GO classes and one invalid GO class.
659+ - **Swiss_Prot_2**: Another valid protein with two valid GO classes and one invalid.
660+ - **Swiss_Prot_3**: Contains valid GO classes but has a sequence length > 1002.
661+ - **Swiss_Prot_4**: Has valid GO classes but contains an invalid amino acid, 'X'.
662+ - **Swiss_Prot_5**: Has a sequence but no GO classes associated.
663+ - **Swiss_Prot_6**: Has GO classes without any associated evidence codes.
664+ - **Swiss_Prot_7**: Has a GO class with an invalid evidence code.
665+ - **Swiss_Prot_8**: Has a sequence length > 1002 and only an invalid GO class.
666+ - **Swiss_Prot_9**: Has no GO classes and contains an invalid amino acid, 'X', in its sequence.
667+ - **Swiss_Prot_10**: Has a valid GO class but lacks a sequence.
668+ - **Swiss_Prot_11**: Has only an invalid GO class and lacks a sequence.
669+
670+ Note:
671+ A GO label is valid if it is annotated with one of the following evidence codes:
672+ EXP, IDA, IPI, IMP, IGI, IEP, TAS, IC.
646673
647674 Returns:
648675 str: The raw UniProt data in string format.
649676 """
650677 protein_sq_1 = GOUniProtMockData .protein_sequences ()["Swiss_Prot_1" ]
651678 protein_sq_2 = GOUniProtMockData .protein_sequences ()["Swiss_Prot_2" ]
652679 raw_str = (
680+ # The protein below has three valid GO classes and one invalid GO class.
653681 f"ID Swiss_Prot_1 Reviewed; { len (protein_sq_1 )} AA. \n "
654682 "AC Q6GZX4;\n "
655683 "DR GO; GO:0000002; C:membrane; EXP:UniProtKB-KW.\n "
@@ -659,6 +687,7 @@ def get_UniProt_raw_data() -> str:
659687 f"SQ SEQUENCE { len (protein_sq_1 )} AA; 29735 MW; B4840739BF7D4121 CRC64;\n "
660688 f" { protein_sq_1 } \n "
661689 "//\n "
690+ # The protein below has two valid GO classes and one invalid GO class.
662691 f"ID Swiss_Prot_2 Reviewed; { len (protein_sq_2 )} AA.\n "
663692 "AC DCGZX4;\n "
664693 "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n "
@@ -668,34 +697,17 @@ def get_UniProt_raw_data() -> str:
668697 f"SQ SEQUENCE { len (protein_sq_2 )} AA; 29735 MW; B4840739BF7D4121 CRC64;\n "
669698 f" { protein_sq_2 } \n "
670699 "//\n "
671- "ID Swiss_Prot_3 Reviewed; 1165 AA.\n "
700+ # The protein below has only valid GO classes, but its sequence length is greater than 1002.
701+ f"ID Swiss_Prot_3 Reviewed; { len (protein_sq_1 * 25 )} AA.\n "
672702 "AC Q6GZX4;\n "
673703 "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n "
674704 "DR GO; GO:0000002; P:regulation of viral transcription; IEP:InterPro.\n "
675705 "DR GO; GO:0000005; P:regulation of viral transcription; TAS:InterPro.\n "
676706 "DR GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n "
677- "SQ SEQUENCE 1165 AA; 129118 MW; FE2984658CED53A8 CRC64;\n "
678- " MRVVVNAKAL EVPVGMSFTE WTRTLSPGSS PRFLAWNPVR PRTFKDVTDP FWNGKVFDLL\n "
679- " GVVNGKDDLL FPASEIQEWL EYAPNVDLAE LERIFVATHR HRGMMGFAAA VQDSLVHVDP\n "
680- " DSVDVTRVKD GLHKELDEHA SKAAATDVRL KRLRSVKPVD GFSDPVLIRT VFSVTVPEFG\n "
681- " DRTAYEIVDS AVPTGSCPYI SAGPFVKTIP GFKPAPEWPA QTAHAEGAVF FKADAEFPDT\n "
682- " KPLKDMYRKY SGAAVVPGDV TYPAVITFDV PQGSRHVPPE DFAARVAESL SLDLRGRPLV\n "
683- " EMGRVVSVRL DGMRFRPYVL TDLLVSDPDA SHVMQTDELN RAHKIKGTVY AQVCGTGQTV\n "
684- " SFQEKTDEDS GEAYISLRVR ARDRKGVEEL MEAAGRVMAI YSRRESEIVS FYALYDKTVA\n "
685- " KEAAPPRPPR KSKAPEPTGD KADRKLLRTL APDIFLPTYS RKCLHMPVIL RGAELEDARK\n "
686- " KGLNLMDFPL FGESERLTYA CKHPQHPYPG LRANLLPNKA KYPFVPCCYS KDQAVRPNSK\n "
687- " WTAYTTGNAE ARRQGRIREG VMQAEPLPEG ALIFLRRVLG QETGSKFFAL RTTGVPETPV\n "
688- " NAVHVAVFQR SLTAEEQAEE RAAMALDPSA MGACAQELYV EPDVDWDRWR REMGDPNVPF\n "
689- " NLLKYFRALE TRYDCDIYIM DNKGIIHTKA VRGRLRYRSR RPTVILHLRE ESCVPVMTPP\n "
690- " SDWTRGPVRN GILTFSPIDP ITVKLHDLYQ DSRPVYVDGV RVPPLRSDWL PCSGQVVDRA\n "
691- " GKARVFVVTP TGKMSRGSFT LVTWPMPPLA APILRTDTGF PRGRSDSPLS FLGSRFVPSG\n "
692- " YRRSVETGAI REITGILDGA CEACLLTHDP VLVPDPSWSD GGPPVYEDPV PSRALEGFTG\n "
693- " AEKKARMLVE YAKKAISIRE GSCTQESVRS FAANGGFVVS PGALDGMKVF NPRFEAPGPF\n "
694- " AEADWAVKVP DVKTARRLVY ALRVASVNGT CPVQEYASAS LVPNFYKTST DFVQSPAYTI\n "
695- " NVWRNDLDQS AVKKTRRAVV DWERGLAVPW PLPETELGFS YSLRFAGISR TFMAMNHPTW\n "
696- " ESAAFAALTW AKSGYCPGVT SNQIPEGEKV PTYACVKGMK PAKVLESGDG TLKLDKSSYG\n "
697- " DVRVSGVMIY RASEGKPMQY VSLLM\n "
707+ f"SQ SEQUENCE { len (protein_sq_1 * 25 )} AA; 129118 MW; FE2984658CED53A8 CRC64;\n "
708+ f" { protein_sq_1 * 25 } \n "
698709 "//\n "
710+ # Below protein has valid go class association but invalid amino acid `X` in its sequence
699711 "ID Swiss_Prot_4 Reviewed; 60 AA.\n "
700712 "AC Q6GZX4;\n "
701713 "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n "
@@ -705,18 +717,54 @@ def get_UniProt_raw_data() -> str:
705717 "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n "
706718 " XAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n "
707719 "//\n "
720+ # Below protein with sequence string but has no GO class
708721 "ID Swiss_Prot_5 Reviewed; 60 AA.\n "
709722 "AC Q6GZX4;\n "
710723 "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n "
711724 "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n "
712725 " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n "
713726 "//\n "
714- "ID Swiss_Prot_5 Reviewed; 60 AA.\n "
727+ # The protein below has a sequence but no valid GO class (its GO entry has no evidence code).
728+ "ID Swiss_Prot_6 Reviewed; 60 AA.\n "
729+ "AC Q6GZX4;\n "
730+ "DR GO; GO:0000023; P:regulation of viral transcription;\n "
731+ "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n "
732+ " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n "
733+ "//\n "
734+ # The protein below has a sequence but no valid GO class (its GO entry has an invalid evidence code).
735+ "ID Swiss_Prot_7 Reviewed; 60 AA.\n "
715736 "AC Q6GZX4;\n "
716- "DR GO; GO:0000005 ; P:regulation of viral transcription;\n "
737+ "DR GO; GO:0000024 ; P:regulation of viral transcription; IEA:SGD. \n "
717738 "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n "
718739 " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n "
719- "//"
740+ "//\n "
741+ # The protein below has a sequence length greater than 1002 and is meant to have only an invalid GO class. NOTE(review): its evidence code is IC, which the docstring lists as valid - confirm.
742+ f"ID Swiss_Prot_8 Reviewed; { len (protein_sq_2 * 25 )} AA.\n "
743+ "AC Q6GZX4;\n "
744+ "DR GO; GO:0000025; P:regulation of viral transcription; IC:Inferred.\n "
745+ f"SQ SEQUENCE { len (protein_sq_2 * 25 )} AA; 29735 MW; B4840739BF7D4121 CRC64;\n "
746+ f" { protein_sq_2 * 25 } \n "
747+ "//\n "
748+ # Below protein with sequence string but invalid amino acid `X` in its sequence
749+ "ID Swiss_Prot_9 Reviewed; 60 AA.\n "
750+ "AC Q6GZX4;\n "
751+ "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n "
752+ " XAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n "
753+ "//\n "
754+ # The protein below has a valid GO class but no sequence string.
755+ "ID Swiss_Prot_10 Reviewed; 60 AA.\n "
756+ "AC Q6GZX4;\n "
757+ "DR GO; GO:0000027; P:regulation of viral transcription; EXP:InterPro.\n "
758+ "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n "
759+ " \n "
760+ "//\n "
761+ # The protein below has only an invalid GO class and no sequence string.
762+ "ID Swiss_Prot_11 Reviewed; 60 AA.\n "
763+ "AC Q6GZX4;\n "
764+ "DR GO; GO:0000028; P:regulation of viral transcription; ND:NoData.\n "
765+ "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n "
766+ " \n "
767+ "//\n "
720768 )
721769
722770 return raw_str
0 commit comments