-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel_definition.py
More file actions
210 lines (181 loc) · 9.83 KB
/
model_definition.py
File metadata and controls
210 lines (181 loc) · 9.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
"""
This file contains the 'Model' class
This file was written with the help of AI
"""
from pathlib import Path
from preprocessing import preprocess_coha, filter_coha
from correlation import calculate_correlation
class Model:
    """
    Configuration and pipeline steps for inducing a historical VAD lexicon.

    Intended call order: preprocess_corpus -> train_embedding ->
    induce_historical_vad_lexicon -> compute_correlation.
    The pipeline methods report a missing prerequisite via print; the validating
    property setters raise ValueError / FileNotFoundError.
    """

    ALLOWED_EMBEDDINGS = ['SGNS', 'CBOW', 'FastText', 'SVD_PPMI', 'PPMI']
    ALLOWED_INDUCTION_ALGORITHMS = ['kNN', 'PaRaSim', 'RandomWalk', 'LinearRegression']
    # File whose existence marks the corresponding embedding as already computed.
    EMBEDDING_DICT = {
        'SGNS': Path('Embeddings/SGNS.model'),
        'CBOW': Path('Embeddings/CBOW.model'),
        'FastText': Path('Embeddings/fasttext_model.bin'),
        'SVD_PPMI': Path('Embeddings/svd_ppmi_matrix.csv'),
        'PPMI': Path('Embeddings/ppmi_matrix.npz'),
    }

    def __init__(self, embedding_name, induction_algorithm_name, vad_lexicon_path, vad_lexicon_name,
                 gold_standard_path=Path('VADLexica/goldEN.vad'), corpus_path=Path('COHA/wlp_1830s_ksq')):
        """
        :param embedding_name: str, name of the embedding to use. Can be 'PPMI', 'SVD_PPMI', 'SGNS', 'CBOW', or 'FastText'.
        :param induction_algorithm_name: str, name of the induction algorithm to be used. Can take the values 'kNN',
        'PaRaSim', 'RandomWalk', or 'LinearRegression'
        :param vad_lexicon_path: str or Path object, path of the seed word vad lexicon to be used
        :param vad_lexicon_name: str, name of the seed word vad lexicon to be used (important for saving)
        :param gold_standard_path: str or Path object, path of the gold standard vad lexicon to be used.
        :param corpus_path: str or Path object, path of unprocessed corpus.
        :raises ValueError: if embedding_name or induction_algorithm_name is not an allowed value.
        :raises FileNotFoundError: if corpus_path, vad_lexicon_path or gold_standard_path does not exist.
        """
        # Embedding information: assigning embedding_name also derives
        # embedding_path and embedding_is_computed (see the setter).
        self.embedding_name = embedding_name
        self.induction_algorithm_name = induction_algorithm_name

        # Preprocessing information
        self.corpus_path = corpus_path
        self.preprocessed_corpus_path = Path('preprocessed_corpus.json')
        self.preprocessed_filtered_corpus_path = Path('preprocessed_filtered_corpus.json')
        self.corpus_is_preprocessed = self.preprocessed_filtered_corpus_path.exists()
        self.corpus_number_of_documents = 0
        self.corpus_number_of_words = 0

        # Lexicon information
        self.vad_lexicon_path = vad_lexicon_path
        self.vad_lexicon_name = vad_lexicon_name
        self.gold_standard_path = gold_standard_path

    @staticmethod
    def _existing_path(value, description):
        """
        Return ``value`` as a Path after checking that it exists.

        :param value: str or Path object to validate
        :param description: str, label used at the start of the error message
        :raises FileNotFoundError: if nothing exists at the given path
        """
        path_obj = value if isinstance(value, Path) else Path(value)
        if not path_obj.exists():
            raise FileNotFoundError(f'{description} path does not exist: {path_obj}')
        return path_obj

    @property
    def embedding_name(self):
        return self._embedding_name

    @embedding_name.setter
    def embedding_name(self, value):
        if value not in self.ALLOWED_EMBEDDINGS:
            raise ValueError(f'Invalid: {value}, value has to be in {self.ALLOWED_EMBEDDINGS}')
        self._embedding_name = value
        # Keep the derived state in sync so that switching the embedding on an
        # existing instance does not leave the previous embedding's file/flag behind.
        self.embedding_path = self.EMBEDDING_DICT[value]
        self.embedding_is_computed = self.embedding_path.exists()

    @property
    def induction_algorithm_name(self):
        return self._induction_algorithm_name

    @induction_algorithm_name.setter
    def induction_algorithm_name(self, value):
        if value not in self.ALLOWED_INDUCTION_ALGORITHMS:
            raise ValueError(f'Invalid: {value}, value has to be in {self.ALLOWED_INDUCTION_ALGORITHMS}')
        self._induction_algorithm_name = value

    @property
    def vad_lexicon_path(self):
        return self._vad_lexicon_path

    @vad_lexicon_path.setter
    def vad_lexicon_path(self, value):
        self._vad_lexicon_path = self._existing_path(value, 'VAD lexicon')

    @property
    def gold_standard_path(self):
        return self._gold_standard_path

    @gold_standard_path.setter
    def gold_standard_path(self, value):
        self._gold_standard_path = self._existing_path(value, 'Gold Standard')

    @property
    def corpus_path(self):
        return self._corpus_path

    @corpus_path.setter
    def corpus_path(self, value):
        self._corpus_path = self._existing_path(value, 'Corpus')

    # METHODS

    def preprocess_corpus(self):
        """
        preprocesses the corpus and saves it under "preprocessed_corpus.json and preprocessed_filtered_corpus.json"

        Errors raised during preprocessing are caught and reported via print;
        in that case corpus_is_preprocessed is left unchanged.

        NOTE(review): the "already preprocessed" check only looks at the filtered
        corpus file, while filter_coha is only called here for the PPMI-based
        embeddings. Confirm whether preprocess_coha itself writes that file;
        otherwise the other embeddings re-run preprocessing on every fresh instance.
        """
        if self.preprocessed_filtered_corpus_path.exists():
            print('Preprocessed corpus already exists')
            self.corpus_is_preprocessed = True
            return

        print('Preprocessing corpus ...')
        try:
            self.corpus_number_of_documents, self.corpus_number_of_words = preprocess_coha(self.corpus_path)
            # Only the PPMI-based embeddings are trained on the filtered corpus.
            if self.embedding_name in ('SVD_PPMI', 'PPMI'):
                filter_coha(n=10)
            print('Corpus preprocessing completed successfully!')
            print(f'number of processed documents: {self.corpus_number_of_documents}.')
            print(f'size of preprocessed corpus: {self.corpus_number_of_words} words.')
            self.corpus_is_preprocessed = True
        except FileNotFoundError as e:
            print(f'File not found error during preprocessing: {e}')
        except Exception as e:
            # Deliberate best-effort boundary: report the problem instead of crashing the caller.
            print(f'Error during corpus preprocessing: {e}.')
            print('Please remember that your corpus must have the same structure as the Example Corpus.\n'
                  'This includes the .txt files!')

    def train_embedding(self):
        """
        trains the embedding and saves it under 'Embeddings'

        Does nothing except print a hint when the corpus has not been preprocessed.
        The training modules are imported lazily so that only the selected one is loaded.
        """
        if not self.corpus_is_preprocessed:
            print('Corpus is not preprocessed!')
            print('Call method preprocess_corpus to preprocess the corpus.')
            return

        name = self.embedding_name
        if name == 'PPMI':
            from Embeddings.PPMI import compute_ppmi_matrix as compute
            corpus = self.preprocessed_filtered_corpus_path
        elif name == 'SVD_PPMI':
            from Embeddings.SVDPPMI import compute_svd_ppmi_matrix as compute
            corpus = self.preprocessed_filtered_corpus_path
        elif name == 'FastText':
            from Embeddings.FastText import compute_fasttext_matrix as compute
            corpus = self.preprocessed_corpus_path
        elif name == 'SGNS':
            from Embeddings.SGNS_and_CBOW import compute_sgns_matrix as compute
            corpus = self.preprocessed_corpus_path
        else:
            # The embedding_name setter guarantees the only remaining value is 'CBOW'.
            from Embeddings.SGNS_and_CBOW import compute_cbow_matrix as compute
            corpus = self.preprocessed_corpus_path

        compute(corpus)
        self.embedding_is_computed = True

    def induce_historical_vad_lexicon(self, save_folder_name=None):
        """
        induces a historical VAD lexicon and saves it under
        'HistoricalVAD/save_folder_name/vad_lexicon_name _ embedding_name _ historicalVAD.tsv'

        Prints a hint and does nothing when the corpus is not preprocessed or the
        embedding is not computed.

        :param save_folder_name: str, the name of the folder to save the historical VAD lexicon;
        defaults to the induction algorithm name
        """
        if save_folder_name is None:
            save_folder_name = self.induction_algorithm_name

        if not self.corpus_is_preprocessed:
            print('Corpus is not preprocessed!')
            print('Call method preprocess_corpus to preprocess the corpus.')
            return
        if not self.embedding_is_computed:
            print('Embedding is not computed!')
            print('Call method train_embedding.')
            return

        algorithm = self.induction_algorithm_name
        if algorithm == 'kNN':
            from kNN import kNN as induce
        elif algorithm == 'PaRaSim':
            from PaRaSim import paRaSim as induce
        elif algorithm == 'RandomWalk':
            from RandomWalk import random_walk as induce
        else:
            # The induction_algorithm_name setter guarantees this is 'LinearRegression'.
            from LinearRegression import linear_regression as induce

        # All induction algorithms are called with the same argument list.
        induce(self.embedding_name, self.vad_lexicon_path, self.gold_standard_path,
               self.vad_lexicon_name, save_folder_name)

    def compute_correlation(self, save_folder_name=None):
        """
        computes correlation between predicted historical VAD values and the gold standard ones

        :param save_folder_name: str, folder under 'HistoricalVAD' holding the induced lexicon;
        defaults to the induction algorithm name
        :return: the result of calculate_correlation (documented by the original author as
        [r_V, p_V, r_A, p_a, r_D, p_D, r_mean]), or None if the induced lexicon file does not exist
        """
        if save_folder_name is None:
            save_folder_name = self.induction_algorithm_name

        path = Path(f'HistoricalVAD/{save_folder_name}/{self.vad_lexicon_name}_{self.embedding_name}_historicalVAD.tsv')
        if not path.exists():
            print('Historical VAD lexicon does not exist yet. '
                  'Please run method "induce_historical_vad_lexicon" first.')
            return None
        return calculate_correlation(self.gold_standard_path, path)