import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import chardet
import fitz  # PyMuPDF, used to read PDF files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score

# Step 1: Prepare the dataset
root_dir = '/kaggle/input/subject'  # Update this with the correct path to your 'subject' directory
categories = ['History', 'chemistry', 'economics', 'english', 'enterainment', 'geography', 'maths', 'physics', 'political science', 'science']

# Step 2: Load and preprocess the text data
documents = []
labels = []

for i, category in enumerate(categories):
    folder_path = os.path.join(root_dir, category)
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        try:
            # Read PDF content using PyMuPDF (fitz)
            with fitz.open(file_path) as pdf_document:
                text = ''
                for page_num in range(pdf_document.page_count):
                    page = pdf_document[page_num]
                    text += page.get_text()
                documents.append(text)
                labels.append(i)
        except Exception as e:
            print(f"Error reading file: {file_path}. Skipping this file. Error: {e}")

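# A small, optional sanity check (illustrative name label_counts, not part of the original
# script): print how many documents were loaded per category before training.
from collections import Counter
label_counts = Counter(labels)
for idx, count in sorted(label_counts.items()):
    print(f"{categories[idx]}: {count} documents")
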
# Step 3: Train the classifier
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(documents)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

classifier = MultinomialNB()
classifier.fit(X_tfidf, y_encoded)

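# (Optional) A minimal sketch of how the Pipeline import above could bundle the vectoriser
# and classifier into a single estimator; text_clf_pipeline is an illustrative name and is
# not used further in this script (it would be fitted with, e.g., text_clf_pipeline.fit(documents, y_encoded)).
text_clf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('nb', MultinomialNB()),
])
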
# Define the number of folds for cross-validation
num_folds = 10
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
fold_accuracy_scores = []
for train_idx, test_idx in kf.split(X_tfidf, y_encoded):
    X_train_fold, X_test_fold = X_tfidf[train_idx], X_tfidf[test_idx]
    y_train_fold, y_test_fold = y_encoded[train_idx], y_encoded[test_idx]

    # Train a fresh model per fold so the full-data classifier above stays available for inference
    fold_classifier = MultinomialNB()
    fold_classifier.fit(X_train_fold, y_train_fold)

    y_pred_fold = fold_classifier.predict(X_test_fold)
    accuracy_fold = accuracy_score(y_test_fold, y_pred_fold)
    fold_accuracy_scores.append(accuracy_fold)

# Calculate and print average accuracy across folds
average_accuracy = sum(fold_accuracy_scores) / num_folds
print(f"AVERAGE ACCURACY ACROSS {num_folds} FOLDS: {average_accuracy:.2f}")

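# A sketch of an equivalent check using scikit-learn's cross_val_score helper; since it is
# given the same KFold splitter (same shuffle seed), it should roughly match the manual loop
# above. cv_scores is an illustrative name introduced here.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(MultinomialNB(), X_tfidf, y_encoded, cv=kf, scoring='accuracy')
print(f"cross_val_score mean accuracy: {cv_scores.mean():.2f}")
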
# Step 4: Use the trained model for inference
def classify_document(file_path):
    try:
        with fitz.open(file_path) as pdf_document:
            text = ""
            for page_num in range(pdf_document.page_count):
                page = pdf_document[page_num]
                text += page.get_text()

        # Vectorise the text and predict the category (MultinomialNB applies Laplace smoothing by default, alpha=1.0)
        text_tfidf = tfidf_vectorizer.transform([text])
        predicted_category_encoded = classifier.predict(text_tfidf)[0]  # Access the first element directly

        # Check that the predicted category code is within the expected range
        if 0 <= predicted_category_encoded < len(categories):
            predicted_category = label_encoder.classes_[predicted_category_encoded]
        else:
            predicted_category = "Unknown"

        return predicted_category, predicted_category_encoded
    except Exception as e:
        # Return a tuple even on failure so callers can unpack the result safely
        print(f"Error classifying the document: {file_path}. Error: {e}")
        return "Unknown", -1

category_mapping = {i: category for i, category in enumerate(categories)}
print("Category Mapping:")
for code, category_name in category_mapping.items():
    print(f"Numeric Code: {code}, Category: {category_name}")

test_data_folder = '/kaggle/input/tests-data'
test_files = [os.path.join(test_data_folder, file) for file in os.listdir(test_data_folder) if file.endswith('.pdf')]

# Loop through each test file and predict its category
for test_document_path in test_files:
    predicted_category, predicted_category_encoded = classify_document(test_document_path)
    predicted_name = category_mapping.get(predicted_category_encoded, "Unknown")

    print(f"Predicted Category Name: {predicted_name}")
    print(f"Predicted Category: {predicted_category}")
    print("------")