File tree Expand file tree Collapse file tree 1 file changed +11
-2
lines changed
chebai_proteins/preprocessing Expand file tree Collapse file tree 1 file changed +11
-2
lines changed Original file line number Diff line number Diff line change 11import os
2+ from abc import ABC
23from pathlib import Path
34from typing import List , Optional , Tuple
45from urllib .error import HTTPError
1213from esm .pretrained import load_model_and_alphabet_core
1314
1415
15- class ProteinDataReader (TokenIndexerReader ):
16+ class _ChebaiProteinsDataReader (DataReader , ABC ):
17+ def __init__ (self , * args , ** kwargs ):
18+ super ().__init__ (* args , ** kwargs )
19+ # Override the token directory path, which would otherwise point to the `chebai` repo instead of `chebai-proteins`,
20+ # so that tokens.txt files for readers defined in the `chebai-proteins` repository are searched for here.
21+ self .dirname = os .path .dirname (__file__ )
22+
23+
24+ class ProteinDataReader (TokenIndexerReader , _ChebaiProteinsDataReader ):
1625 """
1726 Data reader for protein sequences using amino acid tokens. This class processes raw protein sequences into a format
1827 suitable for model input by tokenizing them and assigning unique indices to each token.
@@ -122,7 +131,7 @@ def _read_data(self, raw_data: str) -> List[int]:
122131 return [self ._get_token_index (aa ) for aa in raw_data ]
123132
124133
125- class ESM2EmbeddingReader (DataReader ):
134+ class ESM2EmbeddingReader (_ChebaiProteinsDataReader ):
126135 """
127136 A data reader to process protein sequences using the ESM2 model for embeddings.
128137
You can’t perform that action at this time.
0 commit comments