Fake-News-Detection/fnd.py at main · vj-vanshika/Fake-News-Detection · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# -*- coding: utf-8 -*-
"""fnd.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1cpdXP1eVp4wwgtnXWNGXZCk1896NQKvu

ENVIRONMENT SETUP
"""

!pip install datasets evaluate transformers[sentencepiece]

#libraries import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
import torch
import torch.nn as nn

# Commented out IPython magic to ensure Python compatibility.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)
# %cd /content/drive/MyDrive

"""DATASET LOAD"""

#loading the dataset
true_data = pd.read_csv('a1_True.csv')
fake_data = pd.read_csv('a2_Fake.csv')

# Generate labels True or Fake under the Target Column in 'true_data' and 'fake_data'
true_data['Target'] = ['True']*len(true_data)
fake_data['Target'] = ['Fake']*len(fake_data)

#dataset merge by random mixing
data = true_data.append(fake_data).sample(frac=1).reset_index().drop(columns=['index'])


print(data.shape)
data.head()

# Target column values changing to 0/1. 1 for fake, 0 for true.
data['label'] = pd.get_dummies(data.Target)['Fake']
data.head()

"""Train-Test Split"""

# Train-Validation-Test set split into 70:15:15 ratio
# Train-test split
train_text, temp_text, train_labels, temp_labels = train_test_split(data['text'], data['label'],
                                                                    random_state=2000,
                                                                    test_size=0.3,
                                                                    stratify=data['Target'])
# Validation-Test split
val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,
                                                                random_state=2000,
                                                                test_size=0.5,
                                                                stratify=temp_labels)

"""pre-trained model loading

"""

#loading pre-trained model and tokenizer.
from transformers import AutoTokenizer,AutoModel
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

#histogram fo text in dataset visualization
seq_len = [len(text.split()) for text in train_text]

pd.Series(seq_len).hist(bins = 40,color='blue')
plt.xlabel('Number of Words')
plt.ylabel('Number of texts')

"""Tokenizing the train, test and validation set."""

#tokenization of each dataset
MAX_LENGTH = 15
# Tokenize  sequences in the train set
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(), #list conversion for batchencodeplus
    max_length = MAX_LENGTH,
    pad_to_max_length=True,
    truncation=True
)
# tokenize sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length = MAX_LENGTH,
    pad_to_max_length=True,
    truncation=True
)
# tokenise  sequences in the test set
tokens_test=tokenizer.batch_encode_plus(
    test_text.tolist(),
    max_length= MAX_LENGTH,
    pad_to_max_length=True,
    truncation=True
)

"""Constructing dataloaders for effective processing."""

#conversion to tensors
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

val_seq = torch.tensor(tokens_val['input_ids'])            #inputid in tokenised dataset and attention masks
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist())

#constructing dataloaders

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 32                                               #batch size definition

train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)                     #using random sampler for training dataset
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)                     #sequential sampler for validation dataset
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

#freezing layers of bert.

for param in bert.parameters():
    param.requires_grad = False    #tuning can be done in 3 ways: using pretrained model only
                                   #freezing some layer
                                   #freezing whole architecture

"""Defining the model architecture.

"""

class BERT_Arch(nn.Module):
    def __init__(self, bert):
      super(BERT_Arch, self).__init__()
      self.bert = bert

      self.relu =  nn.ReLU()                    #relu activation function for transforming neg computed values to 0, and pos remains same
      self.fc1 = nn.Linear(768,512)             #input layer->passing weights and inputs to neurons
      self.dropout = nn.Dropout(0.1)              #dropout layer for dropping some parameters
      self.fc2 = nn.Linear(512,2)               #output layer
      self.softmax = nn.LogSoftmax(dim=1)       #additional softmax layer over output layer to convert results into specific task based
    def forward(self, sent_id, mask):           #forward pass , how everything is passing through the layers
      cls_hs = self.bert(sent_id, attention_mask=mask)['pooler_output']
                                                # pass the inputs to the model
      x = self.fc1(cls_hs)                      #inputs passing through input layer with randomized weights and computation being done
      x = self.relu(x)                          #applying relu activation function for conversion
      x = self.dropout(x)                       #dropping out same paramaters while other neurons are activated
      x = self.fc2(x)                         #results being passed to output layer
      x = self.softmax(x)                    # softmax further tuning predictions for being task specific
      return x

model = BERT_Arch(bert)

# Define the optimizer ie default adamw
from transformers import AdamW
optimizer = AdamW(model.parameters(),
                  lr = 1e-5)          # learning rate

cross_entropy  = nn.NLLLoss()         #loss function for computing loss btw actual and predicted values

epochs = 2               #training epochs

"""train and evaluation loop defination:"""

# Defining training and evaluation functions
def train():
  model.train()
  total_loss, total_accuracy = 0, 0

  for step,batch in enumerate(train_dataloader):                #iterating over batches
    if step % 50 == 0 and not step == 0:                        # update after every 50 batches.
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
    batch = [r for r in batch]
    sent_id, mask, labels = batch
    model.zero_grad()                                           # gradient conversion to zero
    preds = model(sent_id, mask)
    loss = cross_entropy(preds, labels)                         #loss function called
    total_loss = total_loss + loss.item()                       #updating total loss
    loss.backward()                                             # backward pass to calculate the gradients for minimizing loss and updating weights
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)     # clip gradients to 1.0. It helps in preventing exploding gradient problem
    optimizer.step()
    preds=preds.detach().cpu().numpy()

  avg_loss = total_loss / len(train_dataloader)                 # compute training loss of the epoch

  return avg_loss                                 # returns the avg loss

def evaluate():
  print("\nEvaluating...")
  model.eval()
  total_loss, total_accuracy = 0, 0
  for step,batch in enumerate(val_dataloader):    # loop over batches
    if step % 50 == 0 and not step == 0:

      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))
                                                  # Report progress
    batch = [t for t in batch]                    #batch to gpu
    sent_id, mask, labels = batch
    with torch.no_grad():                         # Deactivate autograd
      preds = model(sent_id, mask)
      loss = cross_entropy(preds,labels)          # Compute the validation loss between actual and predicted values
      total_loss = total_loss + loss.item()
      preds = preds.detach().cpu().numpy()
  avg_loss = total_loss / len(val_dataloader)         #validation loss of the epoch
  return avg_loss

"""Model Training:"""

# Train and predict
best_valid_loss = float('inf')
train_losses=[]
valid_losses=[]

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    train_loss = train()                       # train function call
    valid_loss = evaluate()                    # evaluate function call
    if valid_loss < best_valid_loss:              # saving the best model to drive
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'c2_new_model_weights.pt')
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

#loaing weights of best model , for no repetitive training of model
path = 'c1_new_model_weights.pt'
model.load_state_dict(torch.load(path))

"""Performance of the model:"""

with torch.no_grad():
  preds = model(test_seq, test_mask)      #passing the test datset to model for predictions
  preds = preds.detach().cpu().numpy()

preds = np.argmax(preds, axis = 1)
print(classification_report(test_y, preds))    #confusion matrix requirement

# testing on unseen data
unseen_news_text = ["Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing",
                    "WATCH: George W. Bush Calls Out Trump For Supporting White Supremacy",
                    "U.S. lawmakers question businessman at 2016 Trump Tower meeting: sources",
                    ]

# tokenize and encode sequences in the test set
MAX_LENGHT = 15
tokens_unseen = tokenizer.batch_encode_plus(
    unseen_news_text,
    max_length = MAX_LENGHT,
    pad_to_max_length=True,
    truncation=True
)

unseen_seq = torch.tensor(tokens_unseen['input_ids'])
unseen_mask = torch.tensor(tokens_unseen['attention_mask'])

with torch.no_grad():
  preds = model(unseen_seq, unseen_mask)
  preds = preds.detach().cpu().numpy()

preds = np.argmax(preds, axis = 1)
preds

"""Interface GUI"""

#interface
!pip install gradio

import gradio as gr
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

path = 'c1_new_model_weights.pt'
model.load_state_dict(torch.load(path))

def result(News):
    #tokenise unseen
    MAX_LENGHT=15
    news_list = list(News.split(" "))
    tokens_news = tokenizer.batch_encode_plus(
     news_list,                                                   #tokenizing the inputs.
    max_length = MAX_LENGHT,
    pad_to_max_length=True,
    truncation=True
    )

    news_seq = torch.tensor(tokens_news['input_ids'])
    news_mask = torch.tensor(tokens_news['attention_mask'])
    with torch.no_grad():
      preds = model(news_seq, news_mask)
      preds = preds.detach().cpu().numpy()
      preds = np.argmax(preds, axis = 1)
    if(preds[0]==1):
      return " fake"
    else:                                      #checking if fake or true and return corresponding values
      return "not fake"

demo= gr.Interface(

    fn=result,
    inputs=gr.Textbox(lines=15, placeholder="Type News here"),   #gradio simple  implementation
    outputs='text')
demo.launch(debug=True)