forked from Hardhat-Enterprises/smishing-frontend
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathUpdated model.py
More file actions
76 lines (51 loc) · 1.98 KB
/
Updated model.py
File metadata and controls
76 lines (51 loc) · 1.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Import packages and libraries
import pandas as pd
import nltk
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import re
# Ensure the NLTK tokenizer data is available.
# Older NLTK releases load the pickled 'punkt' models, while newer releases
# (3.8.2+ / 3.9) load 'punkt_tab' instead; with only 'punkt' present,
# nltk.word_tokenize raises LookupError on those versions. Requesting both
# keeps the script runnable on either.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)

# Read the data set.
df = pd.read_csv('spam.csv', encoding='latin1')

# Preprocessing: keep only the label (v1) and text (v2) columns and give them
# descriptive names. rename() returns a fresh DataFrame, so the column
# assignment below does not operate on a slice of the original frame.
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

# Encode the class labels as integers: ham -> 0, spam -> 1.
# Any other label value maps to NaN.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
# Tokenization function
def tokenize_text(text):
    """Return the word tokens of ``text`` with punctuation removed.

    Every character that is neither a word character nor whitespace is
    deleted first; the cleaned string is then split with
    ``nltk.word_tokenize``.
    """
    cleaned = re.sub(r'[^\w\s]', '', text)
    return nltk.word_tokenize(cleaned)
def extract_ngrams(text, n):
    """Return the list of ``n``-gram tuples built from the tokens of ``text``.

    The text is tokenized with ``tokenize_text`` and the resulting tokens
    are passed to ``ngrams`` with the requested size ``n``.
    """
    return list(ngrams(tokenize_text(text), n))
def message_to_ngrams(message, n):
    """Render ``message`` as a single string of its ``n``-grams.

    The tokens inside each n-gram are glued together with ``_`` and the
    n-grams themselves are separated by single spaces.
    """
    glued = ('_'.join(gram) for gram in extract_ngrams(message, n))
    return ' '.join(glued)
# Apply the n-gram extraction.
n = 1  # N-gram size (1 = unigram, 2 = bigram, ...)
df['message_ngrams'] = df['message'].apply(message_to_ngrams, args=(n,))

# Split data into features and labels.
X = df['message_ngrams']
y = df['label']

# Split into 80% training / 20% testing BEFORE vectorizing, so that the
# vocabulary is learned from the training messages only. Fitting the
# vectorizer on the full data set would let words that occur only in the
# test messages influence the feature space (data leakage).
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert text data into numerical count features using CountVectorizer.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Count matrix for the full data set, kept so the module-level name
# `X_vectorized` remains available; it uses the training vocabulary.
X_vectorized = vectorizer.transform(X)

# Train a Naive Bayes classifier.
model = MultinomialNB()
model.fit(X_train, y_train)

# Model evaluation.
y_pred = model.predict(X_test)
print("Training Accuracy:", accuracy_score(y_train, model.predict(X_train)))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))