Skip to content

Commit 5df4172

Browse files
Merge pull request #74 from stefanDeveloper/documentation/add-docstrings
Update missing docstrings or docstrings with too few information
2 parents e0b0f9c + 3dc130c commit 5df4172

File tree

8 files changed

+140
-12
lines changed

8 files changed

+140
-12
lines changed

src/base/clickhouse_kafka_sender.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,12 @@ def __init__(self, table_name: str):
2727
)()
2828

2929
def insert(self, data: dict):
30-
"""Produces the insert operation to Kafka."""
30+
"""
31+
Produces the insert operation to Kafka.
32+
33+
Args:
34+
data (dict): content to write into the Kafka queue
35+
"""
3136
self.kafka_producer.produce(
3237
topic=f"clickhouse_{self.table_name}",
3338
data=self.data_schema.dumps(data),

src/base/data_classes/batch.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88

99
@dataclass
1010
class Batch:
11+
"""
12+
Class definition of a batch, used to divide the log input into smaller amounts
13+
"""
14+
1115
batch_id: uuid.UUID = field(
1216
metadata={"marshmallow_field": marshmallow.fields.UUID()}
1317
)

src/base/kafka_handler.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,15 @@ def consume_as_json(self) -> tuple[None | str, dict]:
284284
except Exception:
285285
raise ValueError("Unknown data format")
286286

287-
def _all_topics_created(self, topics):
287+
def _all_topics_created(self, topics) -> bool:
288+
"""
289+
Checks whether each topic in a list of topics was created. If not, retries a set number of times
290+
291+
Args:
292+
topics (list): List of topics to check
293+
Returns:
294+
bool
295+
"""
288296
number_of_retries_left = 30
289297
all_topics_created = False
290298
while not all_topics_created: # try for 15 seconds

src/detector/detector.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,7 @@ def calculate_entropy(s: str) -> float:
299299
return all_features.reshape(1, -1)
300300

301301
def detect(self) -> None: # pragma: no cover
302+
"""Method to detect malicious requests in the network flows"""
302303
logger.info("Start detecting malicious requests.")
303304
for message in self.messages:
304305
# TODO predict all messages
@@ -317,6 +318,7 @@ def detect(self) -> None: # pragma: no cover
317318
self.warnings.append(warning)
318319

319320
def send_warning(self) -> None:
321+
"""Dispatch warnings saved to the object's warning list"""
320322
logger.info("Store alert.")
321323
if len(self.warnings) > 0:
322324
overall_score = median(

src/inspector/inspector.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -336,10 +336,20 @@ def inspect(self):
336336
raise NotImplementedError(f"Mode {MODE} is not supported!")
337337

338338
def _inspect_multivariate(self, model: str):
339+
"""
340+
Method to inspect multivariate data for anomalies using a StreamAD Model
341+
Errors are counted in the time window and the model is fitted to retrieve scores.
342+
343+
Args:
344+
model (str): Model name (should be capable of handling multivariate data)
345+
346+
"""
339347
logger.debug(f"Load Model: {model['model']} from {model['module']}.")
340348
if not model["model"] in VALID_MULTIVARIATE_MODELS:
341-
logger.error(f"Model {model} is not a valid univariate model.")
342-
raise NotImplementedError(f"Model {model} is not a valid univariate model.")
349+
logger.error(f"Model {model} is not a valid multivariate model.")
350+
raise NotImplementedError(
351+
f"Model {model} is not a valid multivariate model."
352+
)
343353

344354
module = importlib.import_module(model["module"])
345355
module_model = getattr(module, model["model"])
@@ -367,11 +377,19 @@ def _inspect_multivariate(self, model: str):
367377
self.anomalies.append(0)
368378

369379
def _inspect_ensemble(self, models: str):
380+
"""
381+
Method to inspect data for anomalies using ensembles of two StreamAD models
382+
Errors are counted in the time window and the model is fitted to retrieve scores.
383+
384+
Args:
385+
models (str): Model names (should be valid ensemble models)
386+
387+
"""
370388
logger.debug(f"Load Model: {ENSEMBLE['model']} from {ENSEMBLE['module']}.")
371389
if not ENSEMBLE["model"] in VALID_ENSEMBLE_MODELS:
372-
logger.error(f"Model {ENSEMBLE} is not a valid univariate model.")
390+
logger.error(f"Model {ENSEMBLE} is not a valid ensemble model.")
373391
raise NotImplementedError(
374-
f"Model {ENSEMBLE} is not a valid univariate model."
392+
f"Model {ENSEMBLE} is not a valid ensemble model."
375393
)
376394

377395
module = importlib.import_module(ENSEMBLE["module"])
@@ -389,9 +407,9 @@ def _inspect_ensemble(self, models: str):
389407
for model in models:
390408
logger.debug(f"Load Model: {model['model']} from {model['module']}.")
391409
if not model["model"] in VALID_UNIVARIATE_MODELS:
392-
logger.error(f"Model {models} is not a valid univariate model.")
410+
logger.error(f"Model {models} is not a valid ensemble model.")
393411
raise NotImplementedError(
394-
f"Model {models} is not a valid univariate model."
412+
f"Model {models} is not a valid ensemble model."
395413
)
396414

397415
module = importlib.import_module(model["module"])
@@ -415,8 +433,7 @@ def _inspect_univariate(self, model: str):
415433
Errors are counted in the time window and the model is fitted to retrieve scores.
416434
417435
Args:
418-
model (BaseDetector): StreamAD model.
419-
model_args (dict): Arguments passed to the StreamAD model.
436+
model (str): StreamAD model name.
420437
"""
421438

422439
logger.debug(f"Load Model: {model['model']} from {model['module']}.")
@@ -445,6 +462,7 @@ def _inspect_univariate(self, model: str):
445462
self.anomalies.append(0)
446463

447464
def send_data(self):
465+
"""Pass the anomalous data for the detector unit for further processing"""
448466
total_anomalies = np.count_nonzero(
449467
np.greater_equal(np.array(self.anomalies), SCORE_THRESHOLD)
450468
)

src/logcollector/batch_handler.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,12 @@ def add_message(self, key: str, message: str) -> None:
406406
self._reset_timer()
407407

408408
def _send_all_batches(self, reset_timer: bool = True) -> None:
409+
"""
410+
Dispatch all batches to the Kafka queue
411+
412+
Args:
413+
reset_timer (bool): whether or not the timer should be reset
414+
"""
409415
number_of_keys = 0
410416
total_number_of_batch_messages = self.batch.get_message_count_for_batch()
411417
total_number_of_buffer_messages = self.batch.get_message_count_for_buffer()
@@ -438,6 +444,12 @@ def _send_all_batches(self, reset_timer: bool = True) -> None:
438444
)
439445

440446
def _send_batch_for_key(self, key: str) -> None:
447+
"""
448+
Send one batch based on the key
449+
450+
Args:
451+
key (str): Key to identify the batch
452+
"""
441453
try:
442454
data = self.batch.complete_batch(key)
443455
except ValueError as e:
@@ -447,6 +459,13 @@ def _send_batch_for_key(self, key: str) -> None:
447459
self._send_data_packet(key, data)
448460

449461
def _send_data_packet(self, key: str, data: dict) -> None:
462+
"""
463+
Sends a packet of a batch to the defined Kafka topic
464+
465+
Args:
466+
key (str): key to identify the batch
467+
data (dict): the batch data to send
468+
"""
450469
batch_schema = marshmallow_dataclass.class_schema(Batch)()
451470

452471
self.kafka_produce_handler.produce(
@@ -456,6 +475,7 @@ def _send_data_packet(self, key: str, data: dict) -> None:
456475
)
457476

458477
def _reset_timer(self) -> None:
478+
"""Restarts the internal timer of the object"""
459479
if self.timer:
460480
self.timer.cancel()
461481

src/monitoring/clickhouse_batch_sender.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ class Table:
3232
columns: dict[str, type]
3333

3434
def verify(self, data: dict[str, Any]):
35+
"""
36+
Verify if the data has the correct columns and types.
37+
38+
Args:
39+
data (dict): The values for each cell
40+
"""
3541
if len(data) != len(self.columns):
3642
raise ValueError(
3743
f"Wrong number of fields in data: Expected {len(self.columns)}, got {len(data)}"
@@ -182,7 +188,14 @@ def __del__(self):
182188
self.insert_all()
183189

184190
def add(self, table_name: str, data: dict[str, Any]):
185-
"""Adds the data to the batch for the table. Verifies the fields first."""
191+
"""
192+
Adds the data to the batch for the table. Verifies the fields first.
193+
194+
Args:
195+
table_name (str): Name of the table to add data to
196+
data (dict): The values for each cell in the table
197+
198+
"""
186199
self.tables.get(table_name).verify(data)
187200
self.batch.get(table_name).append(list(data.values()))
188201

@@ -193,7 +206,12 @@ def add(self, table_name: str, data: dict[str, Any]):
193206
self._start_timer()
194207

195208
def insert(self, table_name: str):
196-
"""Inserts the batch for the given table."""
209+
"""
210+
Inserts the batch for the given table.
211+
212+
Args:
213+
table_name (str): Name of the table to insert data to
214+
"""
197215
if self.batch[table_name]:
198216
with self.lock:
199217
self._client.insert(
@@ -217,6 +235,7 @@ def insert_all(self):
217235
self.timer = None
218236

219237
def _start_timer(self):
238+
"""Set the timer for batch processing of data insertion"""
220239
if self.timer:
221240
self.timer.cancel()
222241

src/train/model.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,16 @@ def __init__(
165165
super().__init__(processor, x_train, y_train)
166166

167167
def fdr_metric(self, preds: np.ndarray, dtrain: xgb.DMatrix) -> tuple[str, float]:
168+
"""
169+
Custom FDR metric to evaluate model performance based on False Discovery Rate.
170+
171+
Args:
172+
preds (np.ndarray): The predicted values.
173+
dtrain (xgb.DMatrix): The training data matrix.
174+
175+
Returns:
176+
tuple: A tuple containing the metric name ("fdr") and its value.
177+
"""
168178
# Get the true labels
169179
labels = dtrain.get_label()
170180

@@ -188,6 +198,15 @@ def fdr_metric(self, preds: np.ndarray, dtrain: xgb.DMatrix) -> tuple[str, float
188198
) # -1 is essentiell since XGBoost wants a scoring value (higher is better). However, FDR represents a loss function.
189199

190200
def objective(self, trial):
201+
"""
202+
Optimizes the XGBoost model hyperparameters using cross-validation.
203+
204+
Args:
205+
trial: A trial object from the optimization framework (e.g., Optuna).
206+
207+
Returns:
208+
float: The best FDR value after cross-validation.
209+
"""
191210
dtrain = xgb.DMatrix(self.x_train, label=self.y_train)
192211

193212
param = {
@@ -263,6 +282,13 @@ def predict(self, x):
263282
return self.clf.predict(x)
264283

265284
def train(self, trial, output_path):
285+
"""
286+
Trains the XGBoost model and saves the trained model to a file.
287+
288+
Args:
289+
trial: A trial object from the optimization framework.
290+
output_path (str): The directory path to save the trained model.
291+
"""
266292
logger.info("Number of estimators: {}".format(trial.user_attrs["n_estimators"]))
267293

268294
# dtrain = xgb.DMatrix(self.x_train, label=self.y_train)
@@ -300,6 +326,16 @@ def __init__(
300326

301327
# Define the custom FDR metric
302328
def fdr_metric(self, y_true: np.ndarray, y_pred: np.ndarray):
329+
"""
330+
Custom FDR metric to evaluate the performance of the Random Forest model.
331+
332+
Args:
333+
y_true (np.ndarray): The true labels.
334+
y_pred (np.ndarray): The predicted labels.
335+
336+
Returns:
337+
float: The False Discovery Rate (FDR).
338+
"""
303339
# False Positives (FP): cases where the model predicted 1 but the actual label is 0
304340
FP = np.sum((y_pred == 1) & (y_true == 0))
305341

@@ -315,6 +351,15 @@ def fdr_metric(self, y_true: np.ndarray, y_pred: np.ndarray):
315351
return fdr
316352

317353
def objective(self, trial):
354+
"""
355+
Optimizes the Random Forest model hyperparameters using cross-validation.
356+
357+
Args:
358+
trial: A trial object from the optimization framework (e.g., Optuna).
359+
360+
Returns:
361+
float: The best FDR value after cross-validation.
362+
"""
318363
# Define hyperparameters to optimize
319364
n_estimators = trial.suggest_int("n_estimators", 50, 300)
320365
max_depth = trial.suggest_int("max_depth", 2, 20)
@@ -359,6 +404,13 @@ def predict(self, x):
359404
return self.clf.predict(x)
360405

361406
def train(self, trial, output_path):
407+
"""
408+
Trains the Random Forest model and saves the trained model to a file.
409+
410+
Args:
411+
trial: A trial object from the optimization framework.
412+
output_path (str): The directory path to save the trained model.
413+
"""
362414
self.clf = RandomForestClassifier(**trial.params)
363415
self.clf.fit(self.x_train, self.y_train)
364416

0 commit comments

Comments
 (0)