Add/batch operation (#146)

AHReccese · web-flow · commit 57efefd37a80 · 2024-11-14T22:28:02.000+03:30
* draft of `batch_export` function added to `Export`

* draft of `batch_import` function added to `Import`

* testcase for batch operation added

* `CHANGELOG.md` updated

* `autopep8.sh` run

* fix index bug

* apply `Codacy` feedback

* feedback applied + `autopep8.sh` executed

* lowercasing the first letter

* `CHANGELOG.md` updated
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ## [Unreleased]
 ### Added
+- batch operation testcases
+- `batch_export` function in `pymilo/pymilo_obj.py` 
+- `batch_import` function in `pymilo/pymilo_obj.py`
 - `CCA` model
 - `PLSCanonical` model
 - `PLSRegression` model
diff --git a/pymilo/pymilo_obj.py b/pymilo/pymilo_obj.py
@@ -1,14 +1,18 @@
 # -*- coding: utf-8 -*-
 """PyMilo modules."""
+import os
+import re
 import json
 from copy import deepcopy
 from warnings import warn
 from traceback import format_exc
 from .utils.util import get_sklearn_type, download_model
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from .pymilo_func import get_sklearn_data, get_sklearn_version, to_sklearn_model
 from .exceptions.serialize_exception import PymiloSerializationException, SerializationErrorTypes
 from .exceptions.deserialize_exception import PymiloDeserializationException, DeserializationErrorTypes
-from .pymilo_param import PYMILO_VERSION, UNEQUAL_PYMILO_VERSIONS, UNEQUAL_SKLEARN_VERSIONS, INVALID_IMPORT_INIT_PARAMS
+from .pymilo_param import PYMILO_VERSION, UNEQUAL_PYMILO_VERSIONS, UNEQUAL_SKLEARN_VERSIONS
+from .pymilo_param import INVALID_IMPORT_INIT_PARAMS, BATCH_IMPORT_INVALID_DIRECTORY
 
 
 class Export:
@@ -73,6 +77,45 @@ def to_json(self):
                         "model_type": self.type},
                 })
 
+    @staticmethod
+    def batch_export(models, file_addr, run_parallel=False):
+        """
+        Export a batch of models to individual JSON files in a specified directory.
+
+        This method takes a list of trained models and exports each one into a JSON file. The models
+        are exported concurrently using multiple threads, where each model is saved to a file named
+        'model_{index}.json' in the provided directory.
+
+        :param models: list of models to get exported.
+        :type models: list
+        :param file_addr: the directory where exported JSON files will be saved.
+        :type file_addr: str
+        :param run_parallel: flag indicating the parallel execution of exports
+        :type run_parallel: boolean
+        :return: the count of models exported successfully
+        """
+        if not os.path.exists(file_addr):
+            os.mkdir(file_addr)
+
+        def export_model(model, index):
+            try:
+                Export(model).save(file_adr=os.path.join(file_addr, f"model_{index}.json"))
+                return 1
+            except Exception as _:
+                return 0
+        if run_parallel:
+            with ThreadPoolExecutor() as executor:
+                futures = [executor.submit(export_model, model, index) for index, model in enumerate(models)]
+                count = 0
+                for future in as_completed(futures):
+                    count += future.result()
+                return count
+        else:
+            count = 0
+            for index, model in enumerate(models):
+                count += export_model(model, index)
+            return count
+
 
 class Import:
     """
@@ -138,3 +181,61 @@ def to_model(self):
         :return: sklearn model
         """
         return to_sklearn_model(self)
+
+    @staticmethod
+    def batch_import(file_addr, run_parallel=False):
+        """
+        Import a batch of models from individual JSON files in a specified directory.
+
+        This method takes a directory containing JSON files and imports each one into a model.
+        The models are imported concurrently using multiple threads, ensuring that the files are
+        processed in the order determined by their numeric suffixes. The function returns the
+        successfully imported models in the same order as their filenames.
+
+        :param file_addr: the directory where the JSON files to be imported are located.
+        :type file_addr: str
+        :param run_parallel: flag indicating the parallel execution of imports
+        :type run_parallel: boolean
+        :return: a tuple containing the count of models imported successfully and a list of the
+                imported models in their filename order.
+        """
+        if not os.path.exists(file_addr):
+            raise FileNotFoundError(BATCH_IMPORT_INVALID_DIRECTORY)
+
+        json_files = [f for f in os.listdir(file_addr) if f.endswith('.json')]
+        json_files.sort(key=lambda x: int(re.search(r'_(\d+)\.json$', x).group(1)))
+
+        models = [None] * len(json_files)
+        count = 0
+
+        def import_model(file_path, index):
+            try:
+                model = Import(file_path).to_model()
+                return index, model
+            except Exception as _:
+                return index, None
+
+        if run_parallel:
+            with ThreadPoolExecutor() as executor:
+                futures = {
+                    executor.submit(
+                        import_model,
+                        os.path.join(
+                            file_addr,
+                            file),
+                        index): index for index,
+                    file in enumerate(json_files)}
+                for future in as_completed(futures):
+                    index, model = future.result()
+                    if model is not None:
+                        models[index] = model
+                        count += 1
+                return count, models
+        else:
+            count = 0
+            for index, file in enumerate(json_files):
+                model = Import(os.path.join(file_addr, file)).to_model()
+                if model is not None:
+                    models[index] = model
+                    count += 1
+            return count, models
diff --git a/pymilo/pymilo_param.py b/pymilo/pymilo_param.py
@@ -91,7 +91,7 @@
 INVALID_IMPORT_INIT_PARAMS = "Invalid input parameters, you should either pass a valid file_adr or a json_dump or a url to initiate Import class."
 DOWNLOAD_MODEL_FAILED = "Failed to download the JSON file, Server didn't respond."
 INVALID_DOWNLOADED_MODEL = "The downloaded content is not a valid JSON file."
-
+BATCH_IMPORT_INVALID_DIRECTORY = "The given directory does not exist."
 
 SKLEARN_LINEAR_MODEL_TABLE = {
     "DummyRegressor": dummy.DummyRegressor,
diff --git a/tests/test_misc_functionalities.py/test_batch.py b/tests/test_misc_functionalities.py/test_batch.py
@@ -0,0 +1,27 @@
+import os
+import re
+import random
+import numpy as np
+from pymilo import Export, Import
+from sklearn.metrics import mean_squared_error
+from sklearn.linear_model import LinearRegression
+from pymilo.utils.data_exporter import prepare_simple_regression_datasets
+
+
+def test_batch_execution():
+    x_train, y_train, x_test, _ = prepare_simple_regression_datasets()
+    linear_regression = LinearRegression()
+    linear_regression.fit(x_train, y_train)
+    pre_models = [linear_regression]*100
+    exp_n = Export.batch_export(pre_models, os.getcwd())
+    imp_n, post_models = Import.batch_import(os.getcwd())
+    r_index = random.randint(0, len(post_models) - 1)
+    pre_result = pre_models[r_index].predict(x_test)
+    post_result = post_models[r_index].predict(x_test)
+    mse = mean_squared_error(post_result, pre_result)
+    pattern = re.compile(r'model_\d+\.json')
+    for filename in os.listdir(os.getcwd()):
+        if pattern.match(filename):
+            file_path = os.path.join(os.getcwd(), filename)
+            os.remove(file_path)
+    return exp_n == imp_n and np.abs(mse) <= 10**(-8)