Skip to content

Commit a71b199

Browse files
committed
update swiss data for pretraining test
1 parent 282bc09 commit a71b199

File tree

1 file changed

+79
-31
lines changed

1 file changed

+79
-31
lines changed

tests/unit/mock_data/ontology_mock_data.py

Lines changed: 79 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -632,24 +632,52 @@ def protein_sequences() -> Dict[str, str]:
632632
),
633633
}
634634

635+
@staticmethod
636+
def proteins_for_pretraining() -> List[str]:
637+
"""
638+
Returns a list of protein IDs which will be used for pretraining based on mock UniProt data.
639+
640+
Proteins include those with:
641+
- No GO classes, or only invalid GO classes (evidence code missing or not an accepted one).
642+
643+
Returns:
644+
List[str]: IDs of the proteins selected for pretraining, i.e. those with no valid GO class.
645+
"""
646+
return [
647+
"Swiss_Prot_5", # No GO classes associated
648+
"Swiss_Prot_6", # GO class with no evidence code
649+
"Swiss_Prot_7", # GO class with invalid evidence code
650+
]
651+
635652
@staticmethod
636653
def get_UniProt_raw_data() -> str:
637654
"""
638655
Get raw data in string format for UniProt proteins.
639656
640-
This mock data contains six Swiss-Prot proteins with different properties:
641-
- Swiss_Prot_1 and Swiss_Prot_2 are valid proteins.
642-
- Swiss_Prot_3 has a sequence length greater than 1002.
643-
- Swiss_Prot_4 contains "X", a non-valid amino acid in its sequence.
644-
- Swiss_Prot_5 has no GO IDs mapped to it.
645-
- Swiss_Prot_6 has GO IDs mapped, but no evidence codes.
657+
This mock data contains eleven Swiss-Prot proteins with different properties:
658+
- **Swiss_Prot_1**: A valid protein with three valid GO classes and one invalid GO class.
659+
- **Swiss_Prot_2**: Another valid protein with two valid GO classes and one invalid.
660+
- **Swiss_Prot_3**: Contains valid GO classes but has a sequence length > 1002.
661+
- **Swiss_Prot_4**: Has valid GO classes but contains an invalid amino acid, 'X'.
662+
- **Swiss_Prot_5**: Has a sequence but no GO classes associated.
663+
- **Swiss_Prot_6**: Has GO classes without any associated evidence codes.
664+
- **Swiss_Prot_7**: Has a GO class with an invalid evidence code.
665+
- **Swiss_Prot_8**: Has a sequence length > 1002 and only an invalid GO class.
666+
- **Swiss_Prot_9**: Has no GO classes but contains an invalid amino acid, 'X', in its sequence.
667+
- **Swiss_Prot_10**: Has a valid GO class but lacks a sequence.
668+
- **Swiss_Prot_11**: Has only an invalid GO class and lacks a sequence.
669+
670+
Note:
671+
A valid GO label is one that has one of the following evidence codes
672+
(EXP, IDA, IPI, IMP, IGI, IEP, TAS, IC).
646673
647674
Returns:
648675
str: The raw UniProt data in string format.
649676
"""
650677
protein_sq_1 = GOUniProtMockData.protein_sequences()["Swiss_Prot_1"]
651678
protein_sq_2 = GOUniProtMockData.protein_sequences()["Swiss_Prot_2"]
652679
raw_str = (
680+
# Protein below has three valid associated GO classes and one invalid GO class
653681
f"ID Swiss_Prot_1 Reviewed; {len(protein_sq_1)} AA. \n"
654682
"AC Q6GZX4;\n"
655683
"DR GO; GO:0000002; C:membrane; EXP:UniProtKB-KW.\n"
@@ -659,6 +687,7 @@ def get_UniProt_raw_data() -> str:
659687
f"SQ SEQUENCE {len(protein_sq_1)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n"
660688
f" {protein_sq_1}\n"
661689
"//\n"
690+
# Protein below has two valid associated GO classes and one invalid GO class
662691
f"ID Swiss_Prot_2 Reviewed; {len(protein_sq_2)} AA.\n"
663692
"AC DCGZX4;\n"
664693
"DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
@@ -668,34 +697,17 @@ def get_UniProt_raw_data() -> str:
668697
f"SQ SEQUENCE {len(protein_sq_2)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n"
669698
f" {protein_sq_2}\n"
670699
"//\n"
671-
"ID Swiss_Prot_3 Reviewed; 1165 AA.\n"
700+
# Protein below has only valid associated GO classes but a sequence length greater than 1002
701+
f"ID Swiss_Prot_3 Reviewed; {len(protein_sq_1 * 25)} AA.\n"
672702
"AC Q6GZX4;\n"
673703
"DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
674704
"DR GO; GO:0000002; P:regulation of viral transcription; IEP:InterPro.\n"
675705
"DR GO; GO:0000005; P:regulation of viral transcription; TAS:InterPro.\n"
676706
"DR GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n"
677-
"SQ SEQUENCE 1165 AA; 129118 MW; FE2984658CED53A8 CRC64;\n"
678-
" MRVVVNAKAL EVPVGMSFTE WTRTLSPGSS PRFLAWNPVR PRTFKDVTDP FWNGKVFDLL\n"
679-
" GVVNGKDDLL FPASEIQEWL EYAPNVDLAE LERIFVATHR HRGMMGFAAA VQDSLVHVDP\n"
680-
" DSVDVTRVKD GLHKELDEHA SKAAATDVRL KRLRSVKPVD GFSDPVLIRT VFSVTVPEFG\n"
681-
" DRTAYEIVDS AVPTGSCPYI SAGPFVKTIP GFKPAPEWPA QTAHAEGAVF FKADAEFPDT\n"
682-
" KPLKDMYRKY SGAAVVPGDV TYPAVITFDV PQGSRHVPPE DFAARVAESL SLDLRGRPLV\n"
683-
" EMGRVVSVRL DGMRFRPYVL TDLLVSDPDA SHVMQTDELN RAHKIKGTVY AQVCGTGQTV\n"
684-
" SFQEKTDEDS GEAYISLRVR ARDRKGVEEL MEAAGRVMAI YSRRESEIVS FYALYDKTVA\n"
685-
" KEAAPPRPPR KSKAPEPTGD KADRKLLRTL APDIFLPTYS RKCLHMPVIL RGAELEDARK\n"
686-
" KGLNLMDFPL FGESERLTYA CKHPQHPYPG LRANLLPNKA KYPFVPCCYS KDQAVRPNSK\n"
687-
" WTAYTTGNAE ARRQGRIREG VMQAEPLPEG ALIFLRRVLG QETGSKFFAL RTTGVPETPV\n"
688-
" NAVHVAVFQR SLTAEEQAEE RAAMALDPSA MGACAQELYV EPDVDWDRWR REMGDPNVPF\n"
689-
" NLLKYFRALE TRYDCDIYIM DNKGIIHTKA VRGRLRYRSR RPTVILHLRE ESCVPVMTPP\n"
690-
" SDWTRGPVRN GILTFSPIDP ITVKLHDLYQ DSRPVYVDGV RVPPLRSDWL PCSGQVVDRA\n"
691-
" GKARVFVVTP TGKMSRGSFT LVTWPMPPLA APILRTDTGF PRGRSDSPLS FLGSRFVPSG\n"
692-
" YRRSVETGAI REITGILDGA CEACLLTHDP VLVPDPSWSD GGPPVYEDPV PSRALEGFTG\n"
693-
" AEKKARMLVE YAKKAISIRE GSCTQESVRS FAANGGFVVS PGALDGMKVF NPRFEAPGPF\n"
694-
" AEADWAVKVP DVKTARRLVY ALRVASVNGT CPVQEYASAS LVPNFYKTST DFVQSPAYTI\n"
695-
" NVWRNDLDQS AVKKTRRAVV DWERGLAVPW PLPETELGFS YSLRFAGISR TFMAMNHPTW\n"
696-
" ESAAFAALTW AKSGYCPGVT SNQIPEGEKV PTYACVKGMK PAKVLESGDG TLKLDKSSYG\n"
697-
" DVRVSGVMIY RASEGKPMQY VSLLM\n"
707+
f"SQ SEQUENCE {len(protein_sq_1 * 25)} AA; 129118 MW; FE2984658CED53A8 CRC64;\n"
708+
f" {protein_sq_1 * 25}\n"
698709
"//\n"
710+
# Protein below has valid GO class associations but an invalid amino acid `X` in its sequence
699711
"ID Swiss_Prot_4 Reviewed; 60 AA.\n"
700712
"AC Q6GZX4;\n"
701713
"DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
@@ -705,18 +717,54 @@ def get_UniProt_raw_data() -> str:
705717
"SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n"
706718
" XAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
707719
"//\n"
720+
# Protein below has a sequence string but no GO class
708721
"ID Swiss_Prot_5 Reviewed; 60 AA.\n"
709722
"AC Q6GZX4;\n"
710723
"DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n"
711724
"SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n"
712725
" MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
713726
"//\n"
714-
"ID Swiss_Prot_5 Reviewed; 60 AA.\n"
727+
# Protein below has a sequence string but no `valid` associated GO class (no evidence code)
728+
"ID Swiss_Prot_6 Reviewed; 60 AA.\n"
729+
"AC Q6GZX4;\n"
730+
"DR GO; GO:0000023; P:regulation of viral transcription;\n"
731+
"SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n"
732+
" MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
733+
"//\n"
734+
# Protein below has a sequence string but no `valid` associated GO class (invalid evidence code)
735+
"ID Swiss_Prot_7 Reviewed; 60 AA.\n"
715736
"AC Q6GZX4;\n"
716-
"DR GO; GO:0000005; P:regulation of viral transcription;\n"
737+
"DR GO; GO:0000024; P:regulation of viral transcription; IEA:SGD.\n"
717738
"SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n"
718739
" MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
719-
"//"
740+
"//\n"
741+
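# Protein below has a sequence length greater than 1002 and an `invalid` associated GO class (NOTE(review): its evidence code `IC` is listed as valid in this method's docstring - confirm which is intended)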
742+
f"ID Swiss_Prot_8 Reviewed; {len(protein_sq_2 * 25)} AA.\n"
743+
"AC Q6GZX4;\n"
744+
"DR GO; GO:0000025; P:regulation of viral transcription; IC:Inferred.\n"
745+
f"SQ SEQUENCE {len(protein_sq_2 * 25)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n"
746+
f" {protein_sq_2 * 25}\n"
747+
"//\n"
748+
# Protein below has no GO class and an invalid amino acid `X` in its sequence
749+
"ID Swiss_Prot_9 Reviewed; 60 AA.\n"
750+
"AC Q6GZX4;\n"
751+
"SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n"
752+
" XAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n"
753+
"//\n"
754+
# Protein below has a `valid` associated GO class but no sequence string
755+
"ID Swiss_Prot_10 Reviewed; 60 AA.\n"
756+
"AC Q6GZX4;\n"
757+
"DR GO; GO:0000027; P:regulation of viral transcription; EXP:InterPro.\n"
758+
"SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n"
759+
" \n"
760+
"//\n"
761+
# Protein below has an `invalid` associated GO class but no sequence string
762+
"ID Swiss_Prot_11 Reviewed; 60 AA.\n"
763+
"AC Q6GZX4;\n"
764+
"DR GO; GO:0000028; P:regulation of viral transcription; ND:NoData.\n"
765+
"SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n"
766+
" \n"
767+
"//\n"
720768
)
721769

722770
return raw_str

0 commit comments

Comments
 (0)