Merge pull request #319 from MaithreyiA/patch-1

kishanrajput23 · web-flow · commit 34dbf74abdfb · 2023-11-21T23:35:52.000+05:30
Create document_classifier.ipynb
diff --git a/document_classifier.ipynb b/document_classifier.ipynb
@@ -0,0 +1,109 @@
+import numpy as np # linear algebra
+import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
+
+import os
+for dirname, _, filenames in os.walk('/kaggle/input'):
+    for filename in filenames:
+        print(os.path.join(dirname, filename))
+
+import os
+import chardet
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import StratifiedKFold
+from sklearn.metrics import accuracy_score
+import fitz
+
+# Step 1: Prepare the dataset
+root_dir = '/kaggle/input/subject'  # Update this with the correct path to your 'subject' directory
+categories = ['History', 'chemistry', 'economics', 'english', 'enterainment', 'geography', 'maths', 'physics', 'political science', 'science']
+
+# Step 2: Load and preprocess the text data
+documents = []
+labels = []
+
+for i, category in enumerate(categories):
+    folder_path = os.path.join(root_dir, category)
+    for filename in os.listdir(folder_path):
+        file_path = os.path.join(folder_path, filename)
+        try:
+            # Read PDF content using PyMuPDF (fitz)
+            with fitz.open(file_path) as pdf_document:
+                text = ''
+                for page_num in range(pdf_document.page_count):
+                    page = pdf_document[page_num]
+                    text += page.get_text()
+                documents.append(text)
+                labels.append(i)
+        except Exception as e:
+            print(f"Error reading file: {file_path}. Skipping this file. Error: {e}")
+            
+# Step 3: Train the classifier
+tfidf_vectorizer = TfidfVectorizer(max_features=5000)  
+X_tfidf = tfidf_vectorizer.fit_transform(documents)
+label_encoder = LabelEncoder()
+y_encoded = label_encoder.fit_transform(labels)
+
+classifier = MultinomialNB()
+classifier.fit(X_tfidf, y_encoded)
+
+# Define the number of folds for cross-validation
+from sklearn.model_selection import KFold
+kf = KFold(n_splits=10, shuffle=True, random_state=42)
+accuracies = []
+for train_idx, test_idx in kf.split(X_tfidf, y_encoded):
+    X_train_fold, X_test_fold = X_tfidf[train_idx], X_tfidf[test_idx]
+    y_train_fold, y_test_fold = y_encoded[train_idx], y_encoded[test_idx]
+    
+    classifier = MultinomialNB()
+    classifier.fit(X_train_fold, y_train_fold)
+    
+    y_pred_fold = classifier.predict(X_test_fold)
+    accuracy_fold = accuracy_score(y_test_fold, y_pred_fold)
+    fold_accuracy_scores.append(accuracy_fold)
+
+# Calculate and print average accuracy across folds
+average_accuracy = sum(fold_accuracy_scores) / num_folds
+print(f"AVERAGE ACCURACY ACROSS {num_folds} FOLDS: {average_accuracy:.2f}")
+
+# Step 4: Use the trained model for inference
+def classify_document(file_path):
+    try:
+        with fitz.open(file_path) as pdf_document:
+            text = ""
+            for page_num in range(pdf_document.page_count):
+                page = pdf_document[page_num]
+                text += page.get_text()
+        
+        # Preprocess the text and predict category with Laplace smoothing
+        text_tfidf = tfidf_vectorizer.transform([text])
+        predicted_category_encoded = classifier.predict(text_tfidf)[0]  # Access the first element directly
+        
+        # Check if the predicted category code is within the expected range
+        if 0 <= predicted_category_encoded < len(categories):
+            predicted_category = label_encoder.classes_[predicted_category_encoded]
+        else:
+            predicted_category = "Unknown"
+        
+        return predicted_category, predicted_category_encoded
+    except Exception as e:
+        return f"Error classifying the document. Error: {e}"
+
+category_mapping = {i: category for i, category in enumerate(categories)}
+print("Category Mapping:")
+for code, category_name in category_mapping.items():
+    print(f"Numeric Code: {code}, Category: {category_name}")
+import os
+test_data_folder = '/kaggle/input/tests-data'
+test_files = [os.path.join(test_data_folder, file) for file in os.listdir(test_data_folder) if file.endswith('.pdf')]
+
+# Loop through each test file and predict its category
+for test_document_path in test_files:
+    predicted_category, predicted_category_encoded = classify_document(test_document_path)
+    predicted_name = category_names[predicted_category]
+    
+    print(f"Predicted Category Name: {predicted_name}")
+    print(f"Predicted Category: {predicted_category}")
+    print("------")