small changes

adamspd · adamspd · commit 906eda66b421 · 2023-06-19T10:20:42.000+02:00
diff --git a/README.md b/README.md
@@ -55,7 +55,7 @@ The more data you have, the better the models will perform.
 To train the models, run the following command:
 
 ```sh
-python trainer.py
+python3 spam_detector_ai/trainer.py
 ```
 
 This will train all the models and save them to the `models` directory. For now, there are 3 models:
@@ -66,9 +66,9 @@ This will train all the models and save them to the `models` directory. For now,
 
 ### Tests
 
-The tests results are shown below:
+The test results are shown below:
 
-#### <u>Model: NAIVE_BAYES</u>
+#### _Model: NAIVE_BAYES_
 
 ##### Confusion Matrix:
 
@@ -96,7 +96,7 @@ The tests results are shown below:
 
 <br>
 
-#### <u>Model: RANDOM_FOREST</u>
+#### _Model: RANDOM_FOREST_
 
 ##### Confusion Matrix:
 
@@ -124,7 +124,7 @@ The tests results are shown below:
 
 <br>
 
-#### <u>Model: SVM</u>
+#### _Model: SVM_
 
 ##### Confusion Matrix:
 
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,6 @@
 scikit-learn~=1.2.2
 imblearn~=0.0
 pandas~=2.0.2
-nltk~=3.8.1
+nltk~=3.8.1
+setuptools~=67.8.0
+pytest~=7.3.2
diff --git a/spam_detector_ai/loading_and_processing/data_loader.py b/spam_detector_ai/loading_and_processing/data_loader.py
@@ -1,4 +1,4 @@
-# data_loader.py
+# spam_detector_ai/loading_and_processing/data_loader.py
 
 import pandas as pd
 
diff --git a/spam_detector_ai/loading_and_processing/preprocessor.py b/spam_detector_ai/loading_and_processing/preprocessor.py
@@ -1,4 +1,4 @@
-# preprocessor.py
+# spam_detector_ai/loading_and_processing/preprocessor.py
 
 import re
 
diff --git a/spam_detector_ai/prediction/predict.py b/spam_detector_ai/prediction/predict.py
@@ -1,4 +1,9 @@
 # spam_detector_ai/prediction/predict.py
+"""
+Author: Adams P. David
+Contact: https://adamspierredavid.com/contact/
+Date Written: 2023-06-12
+"""
 
 import os
 import pickle
@@ -41,6 +46,8 @@ def get_model_path(model_type):
 
 
 class SpamDetector:
+    """This class is used to detect whether a message is spam or not spam."""
+
     def __init__(self, model_type=ClassifierType.NAIVE_BAYES):
         # Determine paths based on model's type
         model_path, vectoriser_path = get_model_path(model_type)
@@ -79,6 +86,9 @@ def test_is_spam(self, message_):
 
 
 class VotingSpamDetector:
+    """This class is used to detect whether a message is spam
+    or not spam using majority voting of multiple spam detector models."""
+
     def __init__(self):
         self.detectors = [
             SpamDetector(model_type=ClassifierType.NAIVE_BAYES),
diff --git a/spam_detector_ai/test_and_tuning/__init__.py b/spam_detector_ai/test_and_tuning/__init__.py
@@ -1 +1,2 @@
-from .test import TestModel
+from .test import TestModel
+from .py_test import TestClassifiers
diff --git a/spam_detector_ai/test_and_tuning/fine_tuning_svm.py b/spam_detector_ai/test_and_tuning/fine_tuning_svm.py
@@ -1,3 +1,5 @@
+# spam_detector_ai/test_and_tuning/fine_tuning_svm.py
+
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import GridSearchCV
 from sklearn.svm import SVC
diff --git a/spam_detector_ai/test_and_tuning/py_test.py b/spam_detector_ai/test_and_tuning/py_test.py
@@ -0,0 +1,31 @@
+import os
+import pytest
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
+from spam_detector_ai.classifiers.classifier_types import ClassifierType
+from spam_detector_ai.logger_config import init_logging
+from spam_detector_ai.prediction import SpamDetector
+from spam_detector_ai.training import ModelTrainer
+
+
+@pytest.fixture(scope="module")
+def test_model():
+    classifier_types = [ClassifierType.NAIVE_BAYES, ClassifierType.RANDOM_FOREST, ClassifierType.SVM]
+    logger = init_logging()
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    base_dir = os.path.dirname(current_dir)
+    data_path = os.path.join(base_dir, 'data/spam.csv')
+    initial_trainer = ModelTrainer(data_path=data_path, classifier_type=None, logger=logger)
+    processed_data = initial_trainer._preprocess_data()
+    _, X_test, _, y_test = train_test_split(processed_data['processed_text'], processed_data['label'],
+                                            test_size=0.2, random_state=0)
+    return classifier_types, X_test, y_test
+
+
+class TestClassifiers:
+    def test_classifier_accuracy(self, test_model):
+        classifier_types, X_test, y_test = test_model
+        for ct in classifier_types:
+            detector = SpamDetector(model_type=ct)
+            y_pred = [detector.test_is_spam(message) for message in X_test]
+            assert accuracy_score(y_test, y_pred) > 0.85

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-# data_loader.py`
	`1`	`+# spam_detector_ai/loading_and_processing/data_loader.py`
`2`	`2`
`3`	`3`	`import pandas as pd`
`4`	`4`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-# preprocessor.py`
	`1`	`+# spam_detector_ai/loading_and_processing/preprocessor.py`
`2`	`2`
`3`	`3`	`import re`
`4`	`4`
Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`		`-from .test import TestModel`
	`1`	`+from .test import TestModel`
	`2`	`+from .py_test import TestClassifiers`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+# spam_detector_ai/test_and_tuning/fine_tuning_svm.py`
	`2`	`+`
`1`	`3`	`from sklearn.feature_extraction.text import TfidfVectorizer`
`2`	`4`	`from sklearn.model_selection import GridSearchCV`
`3`	`5`	`from sklearn.svm import SVC`