|
23 | 23 | SQLiteDatabaseConnector, |
24 | 24 | ) |
25 | 25 | from convassist.predictor.utilities.svo_util import SVOUtil |
| 26 | +from convassist.utilities.utils import smart_readlines |
26 | 27 |
|
27 | 28 |
|
28 | 29 | class SentenceCompletionPredictor(Predictor): |
@@ -68,15 +69,13 @@ def configure(self): |
68 | 69 | # We will normalize our vectors to unit length, then is Inner Product equal to cosine similarity |
69 | 70 | self.index = hnswlib.Index(space="cosine", dim=self.embedding_size) |
70 | 71 |
|
71 | | - with open(self.retrieve_database) as f: |
72 | | - self.corpus_sentences = [s.strip() for s in f.readlines()] |
| 72 | + self.corpus_sentences = smart_readlines(self.retrieve_database) |
73 | 73 |
|
74 | | - with open(self.blacklist_file) as f: |
75 | | - self.blacklist_words = [s.strip() for s in f.readlines()] |
| 74 | + self.blacklist_words = smart_readlines(self.blacklist_file) |
76 | 75 |
|
77 | 76 | self.personalized_allowed_toxicwords = self._read_personalized_toxic_words() |
78 | 77 |
|
79 | | - self.svo_util = SVOUtil(self.stopwordsFile) |
| 78 | + self.svo_util = SVOUtil(self.stopwordsFile, nlp_path=self._personalized_resources_path) |
80 | 79 |
|
81 | 80 | if not Path.is_file(Path(self.embedding_cache_path)): |
82 | 81 | self.corpus_embeddings = self.embedder.encode( |
@@ -174,8 +173,9 @@ def _read_personalized_toxic_words(self): |
174 | 173 | with open(self.personalized_allowed_toxicwords_file, "w") as f: |
175 | 174 | pass |
176 | 175 |
|
177 | | - with open(self.personalized_allowed_toxicwords_file) as f: |
178 | | - self.personalized_allowed_toxicwords = f.readlines() |
| 176 | + # with open(self.personalized_allowed_toxicwords_file) as f: |
| 177 | + # self.personalized_allowed_toxicwords = f.readlines() |
| 178 | + self.personalized_allowed_toxicwords = smart_readlines(self.personalized_allowed_toxicwords_file) |
179 | 179 |
|
180 | 180 | self.personalized_allowed_toxicwords = [ |
181 | 181 | s.strip() for s in self.personalized_allowed_toxicwords |
@@ -222,7 +222,7 @@ def _retrieve_fromDataset(self, context): |
222 | 222 | pred = Prediction() |
223 | 223 | probs = {} |
224 | 224 |
|
225 | | - lines = open(self.retrieve_database).readlines() |
| 225 | + lines = smart_readlines(self.retrieve_database) |
226 | 226 | retrieved = [] |
227 | 227 | totalsent = len(lines) |
228 | 228 | for each in lines: |
@@ -509,12 +509,11 @@ def predict(self, max_partial_prediction_size: int, filter: Optional[str] = None |
def load_n_start_sentences(self, max_partial_prediction_size=-1):
    """Load sentence-starter suggestions from the startsents file.

    Reads every line of ``self.startsents`` via ``smart_readlines`` and
    wraps up to ``max_partial_prediction_size`` of them as ``Suggestion``
    objects, each with uniform probability ``1 / len(data)`` (computed
    over ALL loaded lines, not just the returned subset — preserved from
    the original implementation).

    Args:
        max_partial_prediction_size: maximum number of suggestions to
            return; any negative value (the default, -1) means "no limit".

    Returns:
        A ``Prediction`` holding the starter-sentence suggestions
        (empty if the startsents file has no lines).
    """
    predictions = Prediction()

    data = smart_readlines(self.startsents)
    # Guard the empty-file case so 1/len(data) below can never divide by zero.
    if not data:
        return predictions

    # BUG FIX: the previous slice `data[0:max_partial_prediction_size]`
    # silently dropped the LAST sentence whenever the default of -1 was
    # used, because Python reads -1 as "up to, but excluding, the final
    # element". Map any negative sentinel to "return everything".
    if max_partial_prediction_size < 0:
        limit = len(data)
    else:
        limit = max_partial_prediction_size

    for sentence in data[:limit]:
        predictions.add_suggestion(
            Suggestion(sentence.strip(), float(1 / len(data)), self.predictor_name)
        )
    return predictions
519 | 518 |
|
520 | 519 | # Base class method |
|
0 commit comments