Skip to content

Commit e87b90a

Browse files
Bump version to 0.2.0 and update Python dependency range; adjust pywin32 version for Windows compatibility
1 parent 507c5cf commit e87b90a

File tree

19 files changed

+3516
-2552
lines changed

19 files changed

+3516
-2552
lines changed

.vscode/settings.json

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,15 @@
44
],
55
"python.testing.unittestEnabled": false,
66
"python.testing.pytestEnabled": true,
7-
"python.testing.debugPort": 3000
7+
"python.testing.debugPort": 3000,
8+
"python-envs.defaultEnvManager": "ms-python.python:poetry",
9+
"python-envs.defaultPackageManager": "ms-python.python:poetry",
10+
"python-envs.pythonProjects": [
11+
{
12+
"path": "",
13+
"envManager": "ms-python.python:poetry",
14+
"packageManager": "ms-python.python:poetry"
15+
}
16+
],
17+
"sarif-viewer.connectToGithubCodeScanning": "off"
818
}

convassist/predictor/sentence_completion_predictor.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
SQLiteDatabaseConnector,
2424
)
2525
from convassist.predictor.utilities.svo_util import SVOUtil
26+
from convassist.utilities.utils import smart_readlines
2627

2728

2829
class SentenceCompletionPredictor(Predictor):
@@ -68,15 +69,13 @@ def configure(self):
6869
# We will normalize our vectors to unit length, then is Inner Product equal to cosine similarity
6970
self.index = hnswlib.Index(space="cosine", dim=self.embedding_size)
7071

71-
with open(self.retrieve_database) as f:
72-
self.corpus_sentences = [s.strip() for s in f.readlines()]
72+
self.corpus_sentences = smart_readlines(self.retrieve_database)
7373

74-
with open(self.blacklist_file) as f:
75-
self.blacklist_words = [s.strip() for s in f.readlines()]
74+
self.blacklist_words = smart_readlines(self.blacklist_file)
7675

7776
self.personalized_allowed_toxicwords = self._read_personalized_toxic_words()
7877

79-
self.svo_util = SVOUtil(self.stopwordsFile)
78+
self.svo_util = SVOUtil(self.stopwordsFile, nlp_path=self._personalized_resources_path)
8079

8180
if not Path.is_file(Path(self.embedding_cache_path)):
8281
self.corpus_embeddings = self.embedder.encode(
@@ -174,8 +173,9 @@ def _read_personalized_toxic_words(self):
174173
with open(self.personalized_allowed_toxicwords_file, "w") as f:
175174
pass
176175

177-
with open(self.personalized_allowed_toxicwords_file) as f:
178-
self.personalized_allowed_toxicwords = f.readlines()
176+
# with open(self.personalized_allowed_toxicwords_file) as f:
177+
# self.personalized_allowed_toxicwords = f.readlines()
178+
self.personalized_allowed_toxicwords = smart_readlines(self.personalized_allowed_toxicwords_file)
179179

180180
self.personalized_allowed_toxicwords = [
181181
s.strip() for s in self.personalized_allowed_toxicwords
@@ -222,7 +222,7 @@ def _retrieve_fromDataset(self, context):
222222
pred = Prediction()
223223
probs = {}
224224

225-
lines = open(self.retrieve_database).readlines()
225+
lines = smart_readlines(self.retrieve_database)
226226
retrieved = []
227227
totalsent = len(lines)
228228
for each in lines:
@@ -509,12 +509,11 @@ def predict(self, max_partial_prediction_size: int, filter: Optional[str] = None
509509
def load_n_start_sentences(self, max_partial_prediction_size=-1):
510510
predictions = Prediction()
511511

512-
with open(self.startsents) as f:
513-
data = f.readlines()
514-
for sentence in data[0:max_partial_prediction_size]:
515-
predictions.add_suggestion(
516-
Suggestion(sentence.strip(), float(1 / len(data)), self.predictor_name)
517-
)
512+
data = smart_readlines(self.startsents)
513+
for sentence in data[0:max_partial_prediction_size]:
514+
predictions.add_suggestion(
515+
Suggestion(sentence.strip(), float(1 / len(data)), self.predictor_name)
516+
)
518517
return predictions
519518

520519
# Base class method

convassist/predictor/smoothed_ngram_predictor/canned_word_predictor.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
from tqdm import tqdm
77

8-
from convassist.predictor.utilities.nlp import NLP
98
from convassist.utilities.ngram.ngram_map import NgramMap
109
from convassist.utilities.ngram.ngramutil import NGramUtil
1110

@@ -23,7 +22,6 @@ def configure(self):
2322
def extract_svo(self, sent) -> str:
2423
return " ".join(self.svo_utils.extract_svo(sent))
2524

26-
2725
def recreate_database(self):
2826
"""
2927
Recreates the sentence and n-gram databases by adding new phrases and removing outdated ones.
@@ -55,5 +53,3 @@ def recreate_database(self):
5553
@property
5654
def startwords(self):
5755
return os.path.join(self._personalized_resources_path, self._startwords)
58-
59-

convassist/predictor/smoothed_ngram_predictor/general_word_predictor.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,20 @@
44
import collections
55
import json
66
import os
7+
from convassist.utilities.utils import smart_readlines
78

89
from convassist.predictor.smoothed_ngram_predictor.smoothed_ngram_predictor import SmoothedNgramPredictor
910

1011

1112
class GeneralWordPredictor(SmoothedNgramPredictor):
13+
1214
def configure(self):
1315
super().configure()
1416

1517
# Store the set of most frequent starting words based on an AAC dataset
1618
# These will be displayed during empty context
1719
if not os.path.isfile(self.startwords):
18-
aac_lines = open(self.aac_dataset).readlines()
20+
aac_lines = smart_readlines(self.aac_dataset)
1921
startwords = []
2022
for line in aac_lines:
2123
w = line.lower().split()[0]

convassist/predictor/smoothed_ngram_predictor/smoothed_ngram_predictor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def configure(self) -> None:
3333
def extract_svo(self, sent):
3434
return sent
3535

36-
def get_frequent_start_words(self, max_count = 10) -> Prediction:
36+
def get_frequent_start_words(self, max_count=10) -> Prediction:
3737
word_predictions = Prediction()
3838

3939
try:

convassist/predictor/utilities/canned_data.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-License-Identifier: GPL-3.0-or-later
33

44
from convassist.utilities.databaseutils.sqllite_dbconnector import SQLiteDatabaseConnector
5+
from convassist.utilities.utils import smart_readlines
56

67

78
class cannedData:
@@ -99,8 +100,7 @@ def all_phrases_as_list(self) -> list:
99100
def _read_personalized_corpus(self, corpus_path):
100101
corpus = []
101102

102-
with open(corpus_path) as f:
103-
corpus = f.readlines()
104-
corpus = [s.strip() for s in corpus]
103+
corpus = smart_readlines(corpus_path)
104+
corpus = [s.strip() for s in corpus]
105105

106106
return corpus
Lines changed: 12 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,26 @@
11
# Copyright (C) 2024 Intel Corporation
22
# SPDX-License-Identifier: GPL-3.0-or-later
3-
43
import os
5-
import sys
6-
from pathlib import Path
7-
84
import spacy
9-
5+
from spacy.cli.download import download
106
from convassist.utilities.singleton import Singleton
117

128

139
class NLP(metaclass=Singleton):
14-
def __init__(self):
10+
def __init__(self, path: str):
11+
self.path = path
1512
self.nlp = self.load_nlp()
1613

1714
def load_nlp(self):
18-
# TODO: Move this to always download the model from the internet
19-
# TODO: Move this to a config file
20-
21-
nlp_loc = "en_core_web_sm"
22-
# spacy model is in _MEIPASS when running as a pyinstaller executable
23-
if hasattr(sys, "_MEIPASS"): # pragma: no cover
24-
base_path = sys._MEIPASS # type: ignore
25-
nlp_loc = os.path.join(base_path, nlp_loc)
26-
27-
if os.path.exists(nlp_loc):
28-
# Loading the model from a path
29-
child_dirs = [child for child in Path(nlp_loc).iterdir() if child.is_dir()]
30-
if len(child_dirs) > 0:
31-
nlp = spacy.load(child_dirs[0])
32-
33-
else:
34-
# Loading the model from the installed package
35-
if not spacy.util.is_package(nlp_loc):
36-
spacy.cli.download(nlp_loc)
37-
38-
nlp = spacy.load(nlp_loc)
39-
40-
return nlp
15+
nlp_model = "en_core_web_sm"
16+
17+
try:
18+
if not spacy.util.is_package(nlp_model):
19+
download(nlp_model)
20+
nlp = spacy.load(nlp_model)
21+
return nlp
22+
except Exception as e:
23+
raise RuntimeError(f"Failed to load spaCy model '{nlp_model}': {e}")
4124

4225
def get_nlp(self):
4326
return self.nlp

convassist/predictor/utilities/svo_util.py

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,30 @@
11

22
from convassist.predictor.utilities.nlp import NLP
33

4+
45
class SVOUtil:
5-
66
OBJECT_DEPS = {
7-
"dobj",
8-
"pobj",
9-
"dative",
10-
"attr",
11-
"oprd",
12-
"npadvmod",
13-
"amod",
14-
"acomp",
15-
"advmod",
7+
"dobj",
8+
"pobj",
9+
"dative",
10+
"attr",
11+
"oprd",
12+
"npadvmod",
13+
"amod",
14+
"acomp",
15+
"advmod",
1616
}
17-
17+
1818
SUBJECT_DEPS = {"nsubj", "nsubjpass", "csubj", "agent", "expl"}
1919

20-
def __init__(self, stopwordsFile):
21-
self.nlp = NLP().get_nlp()
22-
20+
def __init__(self, stopwordsFile, nlp_path=""):
21+
self.nlp = NLP(nlp_path).get_nlp()
22+
2323
self.stopwords = []
2424
with open(stopwordsFile) as f:
2525
self.stopwords = f.read().splitlines()
2626
self.stopwords = [word.strip() for word in self.stopwords]
2727

28-
2928
def extract_svo(self, sent) -> list[str]:
3029
doc = self.nlp(sent)
3130
sub = []

convassist/predictor_registry.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def _add_predictor(
9090

9191
def get_predictor_class(self, predictor_name, config):
9292

93-
#TODO: Fix this hack.
93+
# TODO: Fix this hack.
9494
# This is a hack to get the predictor class from the config file.
9595
# The config file should have a mapping of predictor_name to predictor_class
9696
# but two predictor classes were renamed and the config file was not updated.
@@ -100,7 +100,7 @@ def get_predictor_class(self, predictor_name, config):
100100
return "CannedWordPredictor"
101101
elif predictor_name == "DefaultSmoothedNgramPredictor":
102102
return "GeneralWordPredictor"
103-
else:
103+
else:
104104
return config.get(predictor_name, "predictor_class")
105105

106106
def model_status(self) -> bool:

convassist/utilities/utils.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from charset_normalizer import from_path


def smart_readlines(path):
    """Read all lines of a text file whose encoding is not known in advance.

    charset_normalizer inspects the file's bytes and proposes the most
    likely encoding; when it cannot classify the file (``best()`` returns
    ``None``) we fall back to UTF-8.

    Args:
        path: Filesystem path of the text file to read.

    Returns:
        list[str]: The file's lines with trailing newlines preserved
        (same contract as ``io.TextIOBase.readlines``).
    """
    best_match = from_path(path).best()
    encoding = best_match.encoding if best_match is not None else "utf-8"
    # errors="replace" keeps a mis-detected encoding — or the UTF-8 fallback
    # applied to a file detection could not classify — from raising
    # UnicodeDecodeError; callers get readable lines with U+FFFD markers
    # instead of a crash.
    with open(path, encoding=encoding, errors="replace") as f:
        return f.readlines()

0 commit comments

Comments
 (0)