NLP_APP/neural.py at master · Venky0892/NLP_APP · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
import pandas as pd
import numpy as np
import re
from datetime import datetime

import xlrd
import nltk
import dill as pickle
nltk.download('stopwords')
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import Dropout
from tensorflow.keras.constraints import min_max_norm
from imblearn.over_sampling import SMOTE
import time
import streamlit as st
from log_util import log
import tensorflow as tf
from tensorflow import keras
# import kerastuner as kt


from tensorflow.keras.utils import *

from tensorflow.keras import backend as k
from tensorflow.keras import models
# from keras import models
# from keras import backend
# from tensorflow.keras.utils.generic_utils import transpose_shape
# # import _pywrap_tensorflow_internal
from tensorflow.keras.models import model_from_json

# NLP
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
# Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


@st.cache(suppress_st_warning=True, allow_output_mutation=True)
class Neural_model:
    def __init__(self, dataset,date, engagement, wordcount, text):
        self.dataset = dataset
        self.date = date
        self.engagement = engagement
        self.wordcount = wordcount
        self.textcolumn = text

    def REPLACE_BY_SPACE_RE(self):
        try:
             REPLACE_BY_SPACE_RE = re.compile("[/(){}\[\]\|@,;!]")

        except Exception as e:
            log('----------Error in REPLACE_BY_SPACE_RE function ----------:{}'.format(e), 'error')
            raise e
        return REPLACE_BY_SPACE_RE

    def BAD_SYMBOLS_RE(self):
        try:
            BAD_SYMBOLS_RE = re.compile("[^0-9a-z #+_]")
        except Exception as e:
            log('----------Error in BAD_SYMBOLS_RE function ----------:{}'.format(e), 'error')
            raise e
        return BAD_SYMBOLS_RE

    @st.cache(suppress_st_warning=True, allow_output_mutation=True)
    def stopwords_update(self):

        negation = ["no", "nor", "not", "don", "don't", "aren", "aren't", "couldn", "couldn't", "didn", "didn't", "doesn", "doesn't",
                  "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "isn", "isn't", "mightn", "mightn't", "mustn", "mustn't",
                  "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't",  'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
        stop = set(stopwords.words('english')) - set(negation)

        # Custom stopwords
        stoplist = ['i','me','my','myself','we','our','ours','ourselves','you',"you're","you've","you'll","you'd",'your',
                  'yours','yourself','yourselves','he','him','his','himself','she',"she's",'her','hers','herself','it',
                  "it's",'its','itself','they','them','their','theirs','themselves','what','which','who','whom','this','that',"that'll",
                  'these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did',
                  'doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about',
                  'against','between','into','through','during','before','after','above','below','to','from','up','down','in','out',
                  'on','off','over','under','again','further','then','once','here','there','when','where','why','all','any',
                  'both','each','few','more','most','other','some','such','only','own','same','so','than','too',
                  'very','s','t','can','will','just','should',"should've",'now','d','ll','m','o','re','ve','y','rt','rt','qt','for',
                  'the','with','in','of','and','its','it','this','i','have','has','would','could','you','a','an',
                  'be','am','can','edushopper','will','to','on','is','by','ive','im','your','we','are','at','as','any','ebay','thank','hello','know',
                  'need','want','look','hi','sorry','http','body','dear','hello','hi','thanks','sir','tomorrow','sent','send','see','there','welcome','what','well','us']

        stop = stop.update(set(stoplist))
        return stop

    @st.cache(suppress_st_warning=True, allow_output_mutation=True)
    def read_data(self):
        try:
            data = pd.read_excel(self.dataset, parse_dates=[self.date])
            data[self.date] = pd.to_datetime(data[self.date], errors='coerce')
            # data[self.date] = data[self.date].dt.strftime('%Y/%m/%d')
            data[self.engagement] = data[self.engagement].astype(int)
            data[self.wordcount] = data[self.wordcount].astype(int)
            # source['Tweet_type'] = source['Tweet_type'].astype('category')
            print(data[[self.date, self.textcolumn, self.wordcount, self.engagement]])
            source = data[[self.date, self.textcolumn, self.wordcount, self.engagement]]


        except Exception as e:
            log('----------Error in Read data ----------:{}'.format(e), 'error')
            raise e
        return source


    # def copy_source(self, data = None):
    #   data = data.copy()
    #   return data
    #
    # copy = copy_source(data)

    @st.cache(suppress_st_warning=True, allow_output_mutation=True)
    def text_preprocess(self, text):
        try:
            """
                text: a string

                return: modified initial string
            """
            negation = ["no", "nor", "not", "don", "don't", "aren", "aren't", "couldn", "couldn't", "didn", "didn't",
                        "doesn", "doesn't",
                        "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "isn", "isn't", "mightn", "mightn't",
                        "mustn", "mustn't",
                        "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
                        'won', "won't", 'wouldn', "wouldn't"]
            stop = set(stopwords.words('english')) - set(negation)

            # Custom stopwords
            stoplist = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
                        "you'd", 'your',
                        'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
                        'herself', 'it',
                        "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who',
                        'whom', 'this', 'that', "that'll",
                        'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
                        'having', 'do', 'does', 'did',
                        'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
                        'by', 'for', 'with', 'about',
                        'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to',
                        'from', 'up', 'down', 'in', 'out',
                        'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where',
                        'why', 'all', 'any',
                        'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than',
                        'too',
                        'very', 's', 't', 'can', 'will', 'just', 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
                        've', 'y', 'rt', 'rt', 'qt', 'for',
                        'the', 'with', 'in', 'of', 'and', 'its', 'it', 'this', 'i', 'have', 'has', 'would', 'could', 'you',
                        'a', 'an',
                        'be', 'am', 'can', 'edushopper', 'will', 'to', 'on', 'is', 'by', 'ive', 'im', 'your', 'we', 'are',
                        'at', 'as', 'any', 'ebay', 'thank', 'hello', 'know',
                        'need', 'want', 'look', 'hi', 'sorry', 'http', 'body', 'dear', 'hello', 'hi', 'thanks', 'sir',
                        'tomorrow', 'sent', 'send', 'see', 'there', 'welcome', 'what', 'well', 'us']

            stop.update(set(stoplist))
            REPLACE_BY_SPACE_RE = re.compile("[/(){}\[\]\|@,;!]")
            BAD_SYMBOLS_RE = re.compile("[^0-9a-z #+_]")
            text = re.sub(r'\d', '', str(text))  # removing digits
            text = re.sub(r"(?:\@|https?\://)\S+", "", str(text))  # removing mentions and urls
            text = text.lower()  # lowercase text
            text = re.sub('[0-9]+', '', text)
            text = REPLACE_BY_SPACE_RE.sub(" ", text)  # replace REPLACE_BY_SPACE_RE symbols by space in text
            text = BAD_SYMBOLS_RE.sub(" ", text)  # delete symbols which are in BAD_SYMBOLS_RE from text
            text = ' '.join([word for word in text.split() if word not in stop])  # delete stopwors from text
            text = text.strip()
        except Exception as e:
            log('----------Error in Text Processing ----------:{}'.format(e), 'error')
            raise e
        return text

    @st.cache(suppress_st_warning=True, allow_output_mutation=True)
    def data_processing(self, data):

        try:
            data[self.textcolumn] = data[self.textcolumn].apply(self.text_preprocess)
            # data['content'] = self.text_preprocess(data['content'])

            data = data[data[self.engagement] > 0]

            data['engagement_bucket'] = pd.qcut(data[self.engagement], q=[0,0.5, 0.75, 1], labels=['Low', 'Medium', 'High'])

            # Creating time related features such as time, day, etc.
            data['day'] = data[self.date].dt.day
            data['hour'] = data[self.date].dt.hour
            data['week_day'] = data[self.date].dt.weekday

            # hour = data.groupby('hour')[self.engagement].mean()
            # weekday = data.groupby('week_day')[self.engagement].mean()
            # dayofmonth = data.groupby('day')[self.engagement].mean()

            X = data[['word_count', 'hour', 'week_day']]
            X = pd.get_dummies(X, drop_first=True)

            X[self.textcolumn] = data[self.textcolumn]
            X.reset_index(drop=True,inplace=True)

            y= data['engagement_bucket']
            # y = pd.get_dummies(y)
        except Exception as e:
            log('----------Error in Data Processing ----------:{}'.format(e), 'error')
            raise e
        return X, y

    @st.cache(suppress_st_warning=True, allow_output_mutation=True)
    def TfidfVectorizer(self, X, y ):
        try:
            vec = TfidfVectorizer(strip_accents='unicode', ngram_range=(1,2), max_features=3000, smooth_idf=True, sublinear_tf=True)
            train_vec = vec.fit_transform(X[self.textcolumn])

            _train = np.hstack([X.drop(self.textcolumn, axis=1), train_vec.toarray()])
            y = LabelEncoder().fit_transform(y)
            scaler = Normalizer().fit(_train)
            _train = scaler.transform(_train)
        except Exception as e:
            log('----------Error in TfidVectorizer ----------:{}'.format(e), 'error')
            raise e
        return _train , y

    @st.cache(suppress_st_warning=True, allow_output_mutation=True)
    def balancing(self,_train , y ):
        try:
            smote = SMOTE('minority')
            _train, y = smote.fit_sample(_train, y)
        except Exception as e:
            log('----------Error in Smote ----------:{}'.format(e), 'error')
            raise e
        return _train, y

    @st.cache(suppress_st_warning=True, allow_output_mutation=True)
    def model(self, kernel_initializer='glorot_uniform',  activation = 'relu', dropout_rate=0.5, weight_constraint=0):
        try:
        # define the keras model
            model = Sequential()
            model.add(Dense(300, input_dim=3003, activation=activation, kernel_initializer=kernel_initializer, kernel_constraint=min_max_norm(min_value=1.0, max_value=1.0)))
            model.add(Dropout(dropout_rate))
            model.add(Dense(200, activation=activation, kernel_initializer=kernel_initializer,kernel_constraint=min_max_norm(min_value=1.0, max_value=1.0)))
            model.add(Dropout(dropout_rate))
            model.add(Dense(100, activation=activation, kernel_initializer=kernel_initializer, kernel_constraint=min_max_norm(min_value=1.0, max_value=1.0)))
            model.add(Dropout(dropout_rate))
            model.add(Dense(3, activation='softmax', kernel_initializer=kernel_initializer, kernel_constraint=min_max_norm(min_value=1.0, max_value=1.0)))

            # compile the keras model
            # optimizer = SGD(lr=learn_rate, momentum=momentum)
            model.compile(loss='sparse_categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])
        except Exception as e:
            log('----------Error in Clean_ts function ----------:{}'.format(e), 'error')
            raise e
        return model


    def final_optimised_model(self, model, _train, y, epochs=None, batch_size=None):
        try:
            # fit the keras model on the dataset
            model.fit(_train, y, epochs=epochs, batch_size=batch_size)

            # evaluate the keras model
            _, accuracy = model.evaluate(_train, y)
            print('Accuracy: %.2f' % (accuracy*100))

            # make class predictions with the model
            # predictions_2 = model.predict_classes(_train)
            time.sleep(10)
            try:
                # serialize model to JSON
                model_json = model.to_json()
                with open("model.json", "w") as json_file:
                    json_file.write(model_json)
                # serialize weights to HDF5
                model.save_weights("model.h5")
                time.sleep(100)
            except Exception as e:
                log('----------Error in Model.json ----------:{}'.format(e), 'error')
                raise e

            # if 'model.json' and 'model.h5':

            # load json and create model


            print("End of the model running")
        except Exception as e:
            log('----------Error in Clean_ts function ----------:{}'.format(e), 'error')
            raise e
        return

    @st.cache(suppress_st_warning=True, allow_output_mutation=True)
    def updating_hyperameters(self, create_model=None, X=None, Y=None):
        try:
            model = KerasClassifier(build_fn=create_model, verbose=1)
            # define the grid search parameters
            batch_size = [10, 20, 40, 60, 80, 100]
            epochs = [10, 50, 100]
            optimizer = ['SGD', 'Adam']
            learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
            momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
            # init_mode = ['uniform', 'glorot_uniform',  'glorot_normal', 'normal', 'zero']
            # activation = ['softmax', 'relu', 'tanh', 'sigmoid',  'linear']
            weight_constraint = [1, 2, 3, 4, 5]
            # dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
            param_grid = dict(batch_size=batch_size, epochs=epochs)
            grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
            grid_result = grid.fit(X, Y)
            best_score = grid_result.best_score_
            best_params = grid_result.best_params_
        except Exception as e:
            log('----------Error in updating Parameters ----------:{}'.format(e), 'error')
            raise e
        return best_score, best_params
    def optimized_model(self):
        try:
            model = self.model()
            loaded_model = self.final_optimised_model(model, self._train, self.label,self.best_params['epochs'],
                                                      self.best_params['batch_size'])
        except Exception as e:
            log('----------Error in Optimized Modeln ----------:{}'.format(e), 'error')
            raise e
        return loaded_model
    @st.cache(suppress_st_warning=True, allow_output_mutation=True)
    def result(self):
        try:
            data = self.read_data()
            X,y = self.data_processing(data)
            _train, y = self.TfidfVectorizer(X,y)
            _train, y = self.balancing(_train, y)
            self._train = _train
            self.label = y
            best_score, best_params = self.updating_hyperameters(self.model, _train, y)
            self.best_params = best_params
        except Exception as e:
            log('----------Error in Result ----------:{}'.format(e), 'error')
            raise e
        return self.optimized_model()