-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsentiment_analysis_with_negation.py
More file actions
109 lines (89 loc) · 4.13 KB
/
sentiment_analysis_with_negation.py
File metadata and controls
109 lines (89 loc) · 4.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
# Load the IMDB reviews from the CSV file.
df = pd.read_csv("IMDB Dataset.csv")

# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
sentiment_codes = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(sentiment_codes)

# Words treated as negation cues by handle_negation().
negation_words = [
    'not',
    "don't", "didn't",
    "isn't", "wasn't", "aren't", "weren't",
    "won't", "wouldn't",
    "shouldn't", "can't", "couldn't",
    "no", "never", "none", "nor",
    "n't",
]
# Text-cleaning function; it is applied to the review column further down
def preprocess(text):
    """Normalize a raw review into lowercase, space-separated words.

    The steps run in this order: lowercase the text, replace HTML tags with
    a space, drop URLs, drop every character other than a-z, whitespace and
    the apostrophe, then collapse whitespace runs and trim both ends.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string. Apostrophes are kept, so contractions such as
        "don't" stay intact.
    """
    text = text.lower()
    # Substitute a space rather than an empty string: otherwise the words on
    # either side of a tag such as "<br />" would be fused into one token
    # once the punctuation between them is stripped.
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"[^a-z\s']", "", text)  # remove non-alphabetic characters
    text = re.sub(r"\s+", " ", text).strip()  # collapse extra whitespace
    return text
# Clean every review and store the result in a new 'cleaned' column.
df['cleaned'] = df['review'].map(preprocess)
# Custom negation handling:
# the word that follows a negation word is prefixed with "not_"
def handle_negation(text, negations=None):
    """Prefix the word that follows each negation cue with "not_".

    A word counts as a negation cue when it is one of ``negations`` or when
    it ends in "n't". The suffix check catches contractions that are not
    listed explicitly (for example "doesn't" or "haven't"); an exact
    whole-word lookup alone can never match a bare "n't" entry.

    Only the single word right after a cue is marked. The cue itself is kept
    unchanged, and a cue that directly follows another cue is kept unchanged
    too, so the pending negation carries over to the next ordinary word.

    Args:
        text: Whitespace-separated text (already cleaned/lowercased by the
            caller).
        negations: Optional iterable of negation words. When omitted, the
            module-level ``negation_words`` list is used.

    Returns:
        The text with marked words, joined by single spaces.
    """
    if negations is None:
        negations = negation_words
    # Build the set once per call so each membership test is O(1).
    cues = frozenset(negations)

    result = []
    negate = False
    for word in text.split():
        if word in cues or word.endswith("n't"):
            negate = True
            result.append(word)
        elif negate:
            result.append("not_" + word)
            negate = False
        else:
            result.append(word)
    return " ".join(result)
# Keep both variants of the text side by side: the cleaned reviews as they
# are, and the same reviews after negation marking.
df['without_negation'] = df['cleaned']
df['with_negation'] = df['cleaned'].apply(handle_negation)

# Split both text variants and the labels in ONE call. This guarantees that
# every row lands in the same partition for both variants and that a single
# label split (y_train / y_test) is valid for both models, instead of relying
# on two separate calls happening to share the same seed.
(
    X_train_wo, X_test_wo,
    X_train_w, X_test_w,
    y_train, y_test,
) = train_test_split(
    df['without_negation'],
    df['with_negation'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)
# Convert the text into TF-IDF features. Each text variant gets its own
# vectorizer, fitted on its training split and then applied to its test split.
tfidf_options = {"max_features": 5000}

vectorizer_wo = TfidfVectorizer(**tfidf_options)
vectorizer_w = TfidfVectorizer(**tfidf_options)

# Variant without negation marking.
X_train_vec_wo = vectorizer_wo.fit_transform(X_train_wo)
X_test_vec_wo = vectorizer_wo.transform(X_test_wo)

# Variant with negation marking.
X_train_vec_w = vectorizer_w.fit_transform(X_train_w)
X_test_vec_w = vectorizer_w.transform(X_test_w)
# Train a logistic-regression model on the features without negation marking,
# then report its accuracy and per-class metrics on the test split.
model_wo = LogisticRegression(max_iter=200)
model_wo.fit(X_train_vec_wo, y_train)

predictions_wo = model_wo.predict(X_test_vec_wo)
accuracy_wo = accuracy_score(y_test, predictions_wo)
report_wo = classification_report(y_test, predictions_wo)

print("\n=== Without Negation Handling ===")
print(f"Accuracy: {accuracy_wo:.4f}")
print(report_wo)
# Train a logistic-regression model on the negation-marked features,
# then report its accuracy and per-class metrics on the test split.
model_w = LogisticRegression(max_iter=200)
model_w.fit(X_train_vec_w, y_train)

predictions_w = model_w.predict(X_test_vec_w)
accuracy_w = accuracy_score(y_test, predictions_w)
report_w = classification_report(y_test, predictions_w)

print("\n=== With Negation Handling ===")
print(f"Accuracy: {accuracy_w:.4f}")
print(report_w)
# Hand-written sentences used to compare the two models side by side.
custom_samples = [
    "I did not like this movie at all.",
    "This is not good.",
    "I don't think it's worth it.",
    "I absolutely loved the storyline!",
    "The movie was amazing and emotional.",
]

# Clean the samples, then derive the negation-marked variant from the
# cleaned text.
samples_cleaned = list(map(preprocess, custom_samples))
samples_negated = list(map(handle_negation, samples_cleaned))

# Vectorize each variant with the vectorizer fitted for the matching model.
samples_vec_wo = vectorizer_wo.transform(samples_cleaned)
samples_vec_w = vectorizer_w.transform(samples_negated)
print("\n--- Custom Test Cases ---")

# Predict all samples once per model, then print both verdicts per input.
custom_preds_wo = model_wo.predict(samples_vec_wo)
custom_preds_w = model_w.predict(samples_vec_w)

for text, pred_wo, pred_w in zip(custom_samples, custom_preds_wo, custom_preds_w):
    # Label 1 means positive; anything else is reported as negative.
    sentiment_wo = "negative" if pred_wo != 1 else "positive"
    sentiment_w = "negative" if pred_w != 1 else "positive"

    print(f"\nInput: {text}")
    print(f"Without Negation Handling: {sentiment_wo}")
    print(f"With Negation Handling : {sentiment_w}")