-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtrainwordtovec.py
More file actions
68 lines (35 loc) · 1.25 KB
/
trainwordtovec.py
File metadata and controls
68 lines (35 loc) · 1.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
from keras.preprocessing.text import Tokenizer,text_to_word_sequence
from gensim.models import Word2Vec
# Paths to the negative / positive halves of the movie-review polarity corpus.
negfilepath='./review_polarity_dataset/txt_sentoken/neg/'
posfilepath='./review_polarity_dataset/txt_sentoken/pos/'

# Collect every review file, negatives first, then positives.  Sorting makes
# the ordering reproducible across platforms (os.listdir order is arbitrary).
filenames=[]
for entry in sorted(os.listdir(negfilepath)):
    filenames.append(os.path.join(negfilepath,entry))
neg_count=len(filenames)
for entry in sorted(os.listdir(posfilepath)):
    filenames.append(os.path.join(posfilepath,entry))

# Label vector aligned with `filenames`: 0 = negative, 1 = positive.
# Derived from the actual directory contents rather than a hard-coded
# 1000 per class, so a partial or extended corpus still labels correctly.
labels=[0]*neg_count
labels.extend([1]*(len(filenames)-neg_count))

# Read every document into memory.  `with` guarantees each handle is closed,
# and an explicit encoding avoids the platform-dependent default codec.
texts=[]
for filepath in filenames:
    with open(filepath,encoding='utf-8') as handle:
        texts.append(handle.read())
def generator_function():
    """Yield one tokenized word list per document in the module-level `texts`.

    Written as a generator so the corpus can be re-streamed: each fresh call
    (made by MakeIter.__iter__) restarts iteration from the first document.
    """
    # Reading a global needs no `global` declaration.
    for document in texts:
        yield text_to_word_sequence(document)
class MakeIter(object):
    """Wrap a generator factory as a restartable iterable.

    A generator object is exhausted after one pass; gensim's Word2Vec needs
    to sweep the corpus several times.  Each ``iter()`` on this object calls
    the factory again (with any stored keyword arguments), producing a fresh
    generator for every pass.
    """

    def __init__(self, generator_func, **kwargs):
        # Keep the factory and its call-time keyword arguments for replay.
        self.generator_func = generator_func
        self.kwargs = kwargs

    def __iter__(self):
        # A brand-new generator per iteration request.
        return self.generator_func(**self.kwargs)
# Restartable corpus view: Word2Vec iterates it once per epoch.
iterable=MakeIter(generator_func=generator_function)

if __name__ == '__main__':
    print('Training word Embeddings:')
    # NOTE(review): `size=` is the gensim<4 keyword (renamed `vector_size`
    # in gensim 4.x) — confirm the installed gensim version.
    # min_count=1 keeps every word, including hapaxes, in the vocabulary.
    embedding_model=Word2Vec(sentences=iterable,size=300,workers=6,min_count=1)
    print('Training Complete')
    output_path='./model.bin'
    # Persist only the word vectors in word2vec text/binary format.
    embedding_model.wv.save_word2vec_format(fname=output_path)
    print("Embedding model saved at {}".format(output_path))