Skip to content

Commit 34dbf74

Browse files
Merge pull request #319 from MaithreyiA/patch-1
Create document_classifier.ipynb
2 parents 707ce11 + 7c3420f commit 34dbf74

File tree

1 file changed

+109
-0
lines changed

1 file changed

+109
-0
lines changed

document_classifier.ipynb

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
import numpy as np # linear algebra
2+
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
3+
4+
import os
5+
# Print the full path of every file found anywhere under the Kaggle input directory.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(current_dir, file_name))
8+
9+
import os
10+
import chardet
11+
from sklearn.feature_extraction.text import TfidfVectorizer
12+
from sklearn.naive_bayes import MultinomialNB
13+
from sklearn.pipeline import Pipeline
14+
from sklearn.preprocessing import LabelEncoder
15+
from sklearn.model_selection import StratifiedKFold
16+
from sklearn.metrics import accuracy_score
17+
import fitz
18+
19+
# Step 1: Prepare the dataset
# Each category name is also the name of a sub-folder of `root_dir`; a
# category's position in this list is used as its numeric label.
root_dir = '/kaggle/input/subject'  # Update this with the correct path to your 'subject' directory
categories = [
    'History',
    'chemistry',
    'economics',
    'english',
    'enterainment',
    'geography',
    'maths',
    'physics',
    'political science',
    'science',
]

# Step 2: Load and preprocess the text data
# `documents[k]` holds the extracted text of one file and `labels[k]` the
# index (into `categories`) of the folder it was read from.
documents = []
labels = []

for label_index, subject_name in enumerate(categories):
    subject_dir = os.path.join(root_dir, subject_name)
    for entry_name in os.listdir(subject_dir):
        file_path = os.path.join(subject_dir, entry_name)
        try:
            # Read PDF content using PyMuPDF (fitz), one page at a time.
            with fitz.open(file_path) as pdf:
                page_texts = [pdf[page_no].get_text() for page_no in range(pdf.page_count)]
            documents.append(''.join(page_texts))
            labels.append(label_index)
        except Exception as e:
            # Best effort: a file that cannot be read is reported and left out
            # of both `documents` and `labels`.
            print(f"Error reading file: {file_path}. Skipping this file. Error: {e}")
42+
43+
# Step 3: Train the classifier
# Convert the raw document texts into TF-IDF features (at most 5000 terms).
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(documents)
# Encode the collected labels as consecutive integer class codes.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

# Final model used for inference: fitted once on the complete data set.
classifier = MultinomialNB()
classifier.fit(X_tfidf, y_encoded)

# Define the number of folds for cross-validation
from sklearn.model_selection import KFold

num_folds = 10
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
fold_accuracy_scores = []

# NOTE(review): the TF-IDF features were fitted on all documents before the
# split, so each fold's test documents influenced the vocabulary/IDF weights
# and the scores below may be slightly optimistic.
for train_idx, test_idx in kf.split(X_tfidf, y_encoded):
    X_train_fold, X_test_fold = X_tfidf[train_idx], X_tfidf[test_idx]
    y_train_fold, y_test_fold = y_encoded[train_idx], y_encoded[test_idx]

    # Use a separate estimator per fold so the full-data `classifier` above
    # is not replaced by a model trained on only part of the data.
    fold_classifier = MultinomialNB()
    fold_classifier.fit(X_train_fold, y_train_fold)

    y_pred_fold = fold_classifier.predict(X_test_fold)
    accuracy_fold = accuracy_score(y_test_fold, y_pred_fold)
    fold_accuracy_scores.append(accuracy_fold)

# Calculate and print average accuracy across folds
average_accuracy = sum(fold_accuracy_scores) / num_folds
print(f"AVERAGE ACCURACY ACROSS {num_folds} FOLDS: {average_accuracy:.2f}")
70+
71+
# Step 4: Use the trained model for inference
def classify_document(file_path):
    """Predict the category of a single document.

    The text of every page is extracted with PyMuPDF (fitz), converted with
    the module-level ``tfidf_vectorizer`` and classified with the
    module-level ``classifier``.

    Args:
        file_path: Path of the document to classify.

    Returns:
        On success, a tuple ``(predicted_category, predicted_category_encoded)``
        where ``predicted_category_encoded`` is the class code returned by the
        classifier and ``predicted_category`` is the matching entry of
        ``label_encoder.classes_`` (or the string ``"Unknown"`` when the code
        lies outside ``range(len(categories))``).
        On any failure, a single error-message string is returned instead of
        a tuple, so callers must check the result type before unpacking.
    """
    try:
        with fitz.open(file_path) as pdf:
            page_texts = [pdf[page_no].get_text() for page_no in range(pdf.page_count)]
        text = "".join(page_texts)

        # Vectorize the text and take the single prediction for it.
        features = tfidf_vectorizer.transform([text])
        predicted_category_encoded = classifier.predict(features)[0]

        # Only translate codes that fall within the expected range.
        code_in_range = 0 <= predicted_category_encoded < len(categories)
        predicted_category = (
            label_encoder.classes_[predicted_category_encoded]
            if code_in_range
            else "Unknown"
        )
        return predicted_category, predicted_category_encoded
    except Exception as e:
        return f"Error classifying the document. Error: {e}"
93+
94+
# Numeric code -> category name, following the order of `categories`.
category_mapping = dict(enumerate(categories))

print("Category Mapping:")
for code in category_mapping:
    category_name = category_mapping[code]
    print(f"Numeric Code: {code}, Category: {category_name}")
98+
import os
99+
# Collect every PDF directly inside the test-data folder.
test_data_folder = '/kaggle/input/tests-data'
test_files = [
    os.path.join(test_data_folder, file)
    for file in os.listdir(test_data_folder)
    if file.endswith('.pdf')
]

# Loop through each test file and predict its category
for test_document_path in test_files:
    result = classify_document(test_document_path)

    # classify_document returns a plain error-message string (not a tuple)
    # when it fails; report it and move on instead of crashing on unpacking.
    if isinstance(result, str):
        print(f"{test_document_path}: {result}")
        print("------")
        continue

    predicted_category, predicted_category_encoded = result
    # `predicted_category` is the numeric category code; translate it to its
    # name, falling back to "Unknown" for values that are not in the mapping.
    predicted_name = category_mapping.get(predicted_category, "Unknown")

    print(f"Predicted Category Name: {predicted_name}")
    print(f"Predicted Category: {predicted_category}")
    print("------")

0 commit comments

Comments
 (0)