Skip to content

Commit a8823c8

Browse files
committed
add abstract DataReader for proteins repo to override token path
1 parent 508a47a commit a8823c8

File tree

1 file changed

+11
-2
lines changed

1 file changed

+11
-2
lines changed

chebai_proteins/preprocessing/reader.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
from abc import ABC
23
from pathlib import Path
34
from typing import List, Optional, Tuple
45
from urllib.error import HTTPError
@@ -12,7 +13,15 @@
1213
from esm.pretrained import load_model_and_alphabet_core
1314

1415

15-
class ProteinDataReader(TokenIndexerReader):
16+
class _ChebaiProteinsDataReader(DataReader, ABC):
17+
def __init__(self, *args, **kwargs):
18+
super().__init__(*args, **kwargs)
19+
# Override the token directory path, which otherwise points to the `chebai` repo instead of `chebai-proteins`, so that
20+
# tokens.txt files for readers defined in the `chebai-proteins` repository are searched for relative to this module.
21+
self.dirname = os.path.dirname(__file__)
22+
23+
24+
class ProteinDataReader(TokenIndexerReader, _ChebaiProteinsDataReader):
1625
"""
1726
Data reader for protein sequences using amino acid tokens. This class processes raw protein sequences into a format
1827
suitable for model input by tokenizing them and assigning unique indices to each token.
@@ -122,7 +131,7 @@ def _read_data(self, raw_data: str) -> List[int]:
122131
return [self._get_token_index(aa) for aa in raw_data]
123132

124133

125-
class ESM2EmbeddingReader(DataReader):
134+
class ESM2EmbeddingReader(_ChebaiProteinsDataReader):
126135
"""
127136
A data reader to process protein sequences using the ESM2 model for embeddings.
128137

0 commit comments

Comments
 (0)