-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtrain_sentiment_model.py
More file actions
31 lines (24 loc) · 1.14 KB
/
train_sentiment_model.py
File metadata and controls
31 lines (24 loc) · 1.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import pandas as pd
import pandas as pd
import numpy as np
# aws
from utils_s3 import get_etf_holdings, list_keys
# gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim.utils import simple_preprocess
from gensim import similarities
from gensim.parsing.porter import PorterStemmer
# Read the labelled sentences: every row is split on '@' into the sentence
# text ('lines') and its label ('label'); the file has no header row.
labeled_data = pd.read_csv(
    'model_inputs/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt',
    sep='@',
    engine='python',
    header=None,
    names=['lines', 'label'],
)


def _clean_tokens(sentence):
    """Convert one raw sentence into the token list fed to Word2Vec.

    Steps, in order:
      1. ``simple_preprocess(..., deacc=True)`` tokenizes the sentence and
         removes punctuation.
      2. ``remove_stopwords`` is applied to each token individually.
         NOTE(review): a stopword token presumably comes back as an empty
         string here rather than being removed -- it is the length filter
         below that actually drops it; confirm against the gensim docs.
      3. Only tokens with 2 <= length < 15 are kept.
    """
    tokens = list(simple_preprocess(sentence, deacc=True))
    tokens = [remove_stopwords(token) for token in tokens]
    return [token for token in tokens if len(token) in range(2, 15)]


# Replace each raw sentence with its cleaned token list.
labeled_data['lines'] = labeled_data['lines'].apply(_clean_tokens)

# Train a Word2Vec model (library default hyper-parameters) on the token lists.
sentences = labeled_data['lines']
test = models.Word2Vec()
test.build_vocab(sentences, progress_per=1000)
test.train(
    sentences,
    total_examples=test.corpus_count,
    epochs=test.epochs,
)

# NOTE(review): init_sims(replace=True) looks deprecated in newer gensim
# releases -- confirm against the installed version before upgrading.
test.init_sims(replace=True)

# Sanity probe of the trained vectors.
# NOTE(review): the result is not stored or printed, so it is only visible
# when this file is run interactively.
test.wv.most_similar(positive=["technology"])