Skip to content

Commit 6b362ef

Browse files
authored
Merge pull request #1361 from Shariq2003/AdditionOfAITextDetector
feat(Addition):Addition of AI_Text_Detector(#1307)
2 parents bfb9946 + 07f6302 commit 6b362ef

File tree

11 files changed

+638
-0
lines changed

11 files changed

+638
-0
lines changed
Lines changed: 352 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,352 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# GloVe EMBEDDING TECHNIQUE"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 1,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"import numpy as np\n",
17+
"\n",
"def load_glove_embeddings(file_path):\n",
"    \"\"\"Load GloVe vectors from a whitespace-separated text file.\n",
"\n",
"    Each non-blank line is expected to hold a token followed by its\n",
"    coefficients. Blank lines are skipped instead of raising IndexError.\n",
"\n",
"    Args:\n",
"        file_path: Path to the GloVe text file (read as UTF-8).\n",
"\n",
"    Returns:\n",
"        dict mapping each token to a float32 NumPy vector.\n",
"    \"\"\"\n",
"    embeddings_index = {}\n",
"    with open(file_path, encoding=\"utf-8\") as f:\n",
"        for line in f:\n",
"            values = line.split()\n",
"            if not values:\n",
"                continue  # tolerate blank lines in the file\n",
"            embeddings_index[values[0]] = np.asarray(values[1:], dtype=\"float32\")\n",
"    return embeddings_index\n",
27+
"\n",
"# Pre-trained GloVe vectors (100-dimensional, per the file name); adjust the\n",
"# path to wherever the file was downloaded.\n",
"glove_file_path = \"glove.6B.100d.txt\"\n",
"glove_embeddings = load_glove_embeddings(file_path=glove_file_path)\n"
30+
]
31+
},
32+
{
33+
"cell_type": "markdown",
34+
"metadata": {},
35+
"source": [
36+
"# TOKENIZE TEXT"
37+
]
38+
},
39+
{
40+
"cell_type": "code",
41+
"execution_count": 2,
42+
"metadata": {},
43+
"outputs": [],
44+
"source": [
45+
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
46+
"from sklearn.model_selection import train_test_split\n",
47+
"import pandas as pd\n",
48+
"\n",
"# Load the essay corpus: `text` holds the essay and `generated` the label\n",
"# (1 is reported as AI-generated by predict_text_origin further down).\n",
"dataset = pd.read_csv(\"AI_Human_Essay.csv\")\n",
"texts = dataset[\"text\"]\n",
"labels = dataset[\"generated\"].astype(int)\n",
"\n",
"# Hold out 20% of the essays; the fixed seed keeps the split reproducible.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    texts, labels, test_size=0.2, random_state=42\n",
")\n",
"\n",
"# Cap the vocabulary via `num_words`; the tokenizer is fitted on the training\n",
"# split only and then applied to both splits.\n",
"max_words = 10000\n",
"tokenizer = Tokenizer(num_words=max_words)\n",
"tokenizer.fit_on_texts(X_train)\n",
"X_train_sequences = tokenizer.texts_to_sequences(X_train)\n",
"X_test_sequences = tokenizer.texts_to_sequences(X_test)\n"
60+
]
61+
},
62+
{
63+
"cell_type": "markdown",
64+
"metadata": {},
65+
"source": [
66+
"# PAD SEQUENCES"
67+
]
68+
},
69+
{
70+
"cell_type": "code",
71+
"execution_count": 3,
72+
"metadata": {},
73+
"outputs": [],
74+
"source": [
75+
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
76+
"\n",
"# Fixed length (in token ids) that every model input is padded or truncated to.\n",
"max_sequence_length = 100\n",
"\n",
"X_train_padded, X_test_padded = (\n",
"    pad_sequences(sequences, maxlen=max_sequence_length)\n",
"    for sequences in (X_train_sequences, X_test_sequences)\n",
")\n"
80+
]
81+
},
82+
{
83+
"cell_type": "markdown",
84+
"metadata": {},
85+
"source": [
86+
"# CREATE EMBEDDING MATRIX"
87+
]
88+
},
89+
{
90+
"cell_type": "code",
91+
"execution_count": 4,
92+
"metadata": {},
93+
"outputs": [],
94+
"source": [
"embedding_dim = 100  # must equal the dimensionality of the loaded GloVe vectors\n",
"word_index = tokenizer.word_index\n",
"\n",
"# Row i holds the GloVe vector of the token whose tokenizer index is i; tokens\n",
"# without a pre-trained vector keep an all-zero row.\n",
"embedding_matrix = np.zeros((max_words, embedding_dim))\n",
"for token, idx in word_index.items():\n",
"    if idx >= max_words:\n",
"        continue  # index falls outside the capped vocabulary\n",
"    vector = glove_embeddings.get(token)\n",
"    if vector is not None:\n",
"        embedding_matrix[idx] = vector"
103+
]
104+
},
105+
{
106+
"cell_type": "markdown",
107+
"metadata": {},
108+
"source": [
109+
"# BUILD THE MODEL"
110+
]
111+
},
112+
{
113+
"cell_type": "code",
114+
"execution_count": 5,
115+
"metadata": {},
116+
"outputs": [
117+
{
118+
"name": "stderr",
119+
"output_type": "stream",
120+
"text": [
121+
"c:\\Users\\ShariqSD\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\keras\\src\\layers\\core\\embedding.py:90: UserWarning: Argument `input_length` is deprecated. Just remove it.\n",
122+
" warnings.warn(\n"
123+
]
124+
},
125+
{
126+
"name": "stdout",
127+
"output_type": "stream",
128+
"text": [
129+
"Epoch 1/3\n",
130+
"\u001b[1m12181/12181\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5317s\u001b[0m 436ms/step - accuracy: 0.8920 - loss: 0.3980 - val_accuracy: 0.9496 - val_loss: 0.1888\n",
131+
"Epoch 2/3\n",
132+
"\u001b[1m12181/12181\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4890s\u001b[0m 397ms/step - accuracy: 0.9500 - loss: 0.1801 - val_accuracy: 0.9626 - val_loss: 0.1449\n",
133+
"Epoch 3/3\n",
134+
"\u001b[1m12181/12181\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4077s\u001b[0m 335ms/step - accuracy: 0.9539 - loss: 0.1708 - val_accuracy: 0.9640 - val_loss: 0.1456\n"
135+
]
136+
},
137+
{
138+
"data": {
139+
"text/plain": [
140+
"<keras.src.callbacks.history.History at 0x14d81da2990>"
141+
]
142+
},
143+
"execution_count": 5,
144+
"metadata": {},
145+
"output_type": "execute_result"
146+
}
147+
],
148+
"source": [
149+
"from tensorflow.keras.models import Sequential\n",
150+
"from tensorflow.keras.layers import Embedding, LSTM, Dense, BatchNormalization, Dropout\n",
151+
"from tensorflow.keras.regularizers import l2\n",
152+
"\n",
"# Frozen GloVe embeddings -> two stacked, L2-regularised LSTMs -> dense head\n",
"# with a single sigmoid unit for the binary label.\n",
"model = Sequential([\n",
"    # `input_length` is deprecated (Keras warns \"Just remove it\"), so it is\n",
"    # omitted; the sequence length is taken from the padded input at fit time.\n",
"    Embedding(max_words, embedding_dim, weights=[embedding_matrix], trainable=False),\n",
"    LSTM(256, return_sequences=True, kernel_regularizer=l2(0.001), recurrent_regularizer=l2(0.001)),\n",
"    Dropout(0.5),\n",
"    LSTM(128, kernel_regularizer=l2(0.001), recurrent_regularizer=l2(0.001)),\n",
"    Dropout(0.5),\n",
"    Dense(64, activation=\"relu\"),\n",
"    BatchNormalization(),\n",
"    Dense(1, activation=\"sigmoid\"),\n",
"])\n",
"\n",
"# Compile the model\n",
"model.compile(optimizer=\"adam\", loss=\"binary_crossentropy\", metrics=[\"accuracy\"])\n",
"\n",
"# Train for 3 epochs. NOTE: the held-out test split doubles as validation data\n",
"# here, so the validation metrics are not independent of the test metrics\n",
"# reported in the evaluation cells.\n",
"model.fit(X_train_padded, y_train, epochs=3, validation_data=(X_test_padded, y_test))\n"
168+
]
169+
},
170+
{
171+
"cell_type": "code",
172+
"execution_count": 6,
173+
"metadata": {},
174+
"outputs": [
175+
{
176+
"name": "stdout",
177+
"output_type": "stream",
178+
"text": [
179+
"\u001b[1m3046/3046\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m341s\u001b[0m 112ms/step\n",
180+
"Classification Report:\n",
181+
" precision recall f1-score support\n",
182+
"\n",
183+
" 0 0.97 0.98 0.97 61112\n",
184+
" 1 0.96 0.94 0.95 36335\n",
185+
"\n",
186+
" accuracy 0.96 97447\n",
187+
" macro avg 0.96 0.96 0.96 97447\n",
188+
"weighted avg 0.96 0.96 0.96 97447\n",
189+
"\n",
190+
"Confusion Matrix:\n",
191+
"[[59679 1433]\n",
192+
" [ 2071 34264]]\n"
193+
]
194+
}
195+
],
196+
"source": [
197+
"import numpy as np\n",
198+
"from sklearn.metrics import classification_report, confusion_matrix\n",
199+
"\n",
"# Predicted probability of the positive class for every test essay.\n",
"y_pred_probs = model.predict(X_test_padded)\n",
"\n",
"# Threshold explicitly at 0.5. np.round rounds half to even, so it would map a\n",
"# probability of exactly 0.5 to class 0, unlike predict_text_origin's `>= 0.5`.\n",
"y_pred = (y_pred_probs >= 0.5).astype(int).ravel()\n",
"\n",
"# Compute classification report\n",
"print(\"Classification Report:\")\n",
"print(classification_report(y_test, y_pred))\n",
"\n",
"# Compute confusion matrix\n",
"print(\"Confusion Matrix:\")\n",
"print(confusion_matrix(y_test, y_pred))\n"
214+
]
215+
},
216+
{
217+
"cell_type": "markdown",
218+
"metadata": {},
219+
"source": [
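"# Classification Report:\n",
"\n",
"Note: the figures below do not match the outputs of the evaluation cells in this notebook (support of 5,829 here versus 97,447 there); they appear to have been recorded from a separate run on a smaller test set.\n",
"\n",
"```\n",
"              precision    recall  f1-score   support\n",
"\n",
"           0       0.97      0.99      0.98      3539\n",
"           1       0.98      0.96      0.97      2290\n",
"\n",
"    accuracy                           0.98      5829\n",
"   macro avg       0.98      0.97      0.98      5829\n",
"weighted avg       0.98      0.98      0.98      5829\n",
"```\n",
"\n",
"# Confusion Matrix:\n",
"\n",
"```\n",
"[[3496   43]\n",
" [  91 2199]]\n",
"```"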
233+
]
234+
},
235+
{
236+
"cell_type": "code",
237+
"execution_count": 7,
238+
"metadata": {},
239+
"outputs": [
240+
{
241+
"name": "stdout",
242+
"output_type": "stream",
243+
"text": [
244+
"\u001b[1m3046/3046\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m336s\u001b[0m 110ms/step\n",
245+
"Classification Report:\n",
246+
" precision recall f1-score support\n",
247+
"\n",
248+
" 0 0.97 0.98 0.97 61112\n",
249+
" 1 0.96 0.94 0.95 36335\n",
250+
"\n",
251+
" accuracy 0.96 97447\n",
252+
" macro avg 0.96 0.96 0.96 97447\n",
253+
"weighted avg 0.96 0.96 0.96 97447\n",
254+
"\n",
255+
"Confusion Matrix:\n",
256+
"[[59679 1433]\n",
257+
" [ 2071 34264]]\n"
258+
]
259+
}
260+
],
261+
"source": [
"# Perform predictions (this repeats the evaluation above with the decision\n",
"# threshold written out explicitly).\n",
"y_pred_prob = model.predict(X_test_padded)\n",
"\n",
"# `>=` keeps the boundary case (probability exactly 0.5) consistent with\n",
"# predict_text_origin, which labels it as the positive class.\n",
"y_pred = (y_pred_prob >= 0.5).astype('int32').ravel()\n",
"\n",
"# Print classification report\n",
"print(\"Classification Report:\")\n",
"print(classification_report(y_test, y_pred))\n",
"\n",
"# Print confusion matrix\n",
"print(\"Confusion Matrix:\")\n",
"print(confusion_matrix(y_test, y_pred))\n"
273+
]
274+
},
275+
{
276+
"cell_type": "code",
277+
"execution_count": 8,
278+
"metadata": {},
279+
"outputs": [
280+
{
281+
"name": "stdout",
282+
"output_type": "stream",
283+
"text": [
284+
"Enter the text to classify:\n",
285+
"\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 110ms/step\n",
286+
"AI-generated\n"
287+
]
288+
}
289+
],
290+
"source": [
"def predict_text_origin(model, tokenizer, max_sequence_length, text=None):\n",
"    \"\"\"Classify a piece of text as AI- or human-written.\n",
"\n",
"    Args:\n",
"        model: Trained classifier; `model.predict` is expected to return one\n",
"            probability per input row (single sigmoid output).\n",
"        tokenizer: Fitted tokenizer providing `texts_to_sequences`.\n",
"        max_sequence_length: Length the token sequence is padded/truncated to.\n",
"        text: Text to classify. When None (the default), the user is prompted\n",
"            to type the text.\n",
"\n",
"    Returns:\n",
"        \"AI-generated\" if the predicted probability is >= 0.5, otherwise\n",
"        \"Human-generated\". The label is also printed.\n",
"    \"\"\"\n",
"    if text is None:\n",
"        # Prompt user to enter text\n",
"        print(\"Enter the text to classify:\")\n",
"        text = input()\n",
"\n",
"    # Tokenize and pad the input text\n",
"    input_sequence = tokenizer.texts_to_sequences([text])\n",
"    input_padded = pad_sequences(input_sequence, maxlen=max_sequence_length)\n",
"\n",
"    # Take the scalar probability of the single input row rather than relying\n",
"    # on the truthiness of a one-element array.\n",
"    probability = float(model.predict(input_padded)[0][0])\n",
"\n",
"    # Determine the predicted class label\n",
"    predicted_label = \"AI-generated\" if probability >= 0.5 else \"Human-generated\"\n",
"\n",
"    print(predicted_label)\n",
"    # Return the label so callers that store the result get it instead of None.\n",
"    return predicted_label\n",
307+
"\n",
"# Example usage: prompts for a text on stdin and prints the predicted label.\n",
"predicted_class = predict_text_origin(\n",
"    model=model, tokenizer=tokenizer, max_sequence_length=max_sequence_length\n",
")\n",
310+
"\n"
311+
]
312+
},
313+
{
314+
"cell_type": "code",
315+
"execution_count": 10,
316+
"metadata": {},
317+
"outputs": [],
318+
"source": [
"import os\n",
"\n",
"# Save the weights of the model. The target directory is created first so the\n",
"# save does not fail when `models/` does not exist yet.\n",
"os.makedirs('models', exist_ok=True)\n",
"model.save_weights('models/model_updated_weight_ai.weights.h5')\n",
"\n"
325+
]
326+
}
327+
],
328+
"metadata": {
329+
"colab": {
330+
"provenance": []
331+
},
332+
"kernelspec": {
333+
"display_name": "Python 3 (ipykernel)",
334+
"language": "python",
335+
"name": "python3"
336+
},
337+
"language_info": {
338+
"codemirror_mode": {
339+
"name": "ipython",
340+
"version": 3
341+
},
342+
"file_extension": ".py",
343+
"mimetype": "text/x-python",
344+
"name": "python",
345+
"nbconvert_exporter": "python",
346+
"pygments_lexer": "ipython3",
347+
"version": "3.12.2"
348+
}
349+
},
350+
"nbformat": 4,
351+
"nbformat_minor": 4
352+
}
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)