Update auto_sklearn.py

zuliani99 · zuliani99 · commit 95bb17a65945 · 2021-08-18T22:14:08.000+02:00
Divide memory_limit by os.cpu_count(), as suggested in this issue:
_pickle.UnpicklingError: pickle data was truncated #1215
diff --git a/app/algorithms/auto_sklearn.py b/app/algorithms/auto_sklearn.py
@@ -10,17 +10,18 @@
 from termcolor import colored
 import psutil
 import time
+import os
 
 
 def make_classification(X_train, X_test, y_train, y_test, timelife, y, time_start):
   # Classification model
   automl = autosklearn.classification.AutoSklearnClassifier(
-          time_left_for_this_task=timelife*60,
-          per_run_time_limit=30,
-          memory_limit=int(int(psutil.virtual_memory().available * 1e-6) * 0.75),
-          n_jobs=-1,
-          resampling_strategy_arguments = {'cv': 10}
-    )
+    time_left_for_this_task=timelife*60,
+    per_run_time_limit=30,
+    memory_limit=int(int(psutil.virtual_memory().available * 1e-6) * 0.75)/os.cpu_count(),
+    n_jobs=-1,
+    resampling_strategy_arguments = {'cv': 10}
+  )
   automl.fit(X_train, y_train)
   y_pred = automl.predict(X_test)
   pipelines = str(pd.DataFrame(pd.Series(automl.show_models())).iloc[0].squeeze()) # Pipeline
@@ -38,12 +39,12 @@ def make_classification(X_train, X_test, y_train, y_test, timelife, y, time_star
 def make_regression(X_train, X_test, y_train, y_test, timelife, time_start):
   # Regression model
   automl = autosklearn.regression.AutoSklearnRegressor(
-          time_left_for_this_task=timelife*60,
-          per_run_time_limit=30,
-          memory_limit=int(int(psutil.virtual_memory().available * 1e-6) * 0.75),
-          n_jobs=-1,
-          resampling_strategy_arguments = {'cv': 10}
-    )
+    time_left_for_this_task=timelife*60,
+    per_run_time_limit=30,
+    memory_limit=int(int(psutil.virtual_memory().available * 1e-6) * 0.75)/os.cpu_count(),
+    n_jobs=-1,
+    resampling_strategy_arguments = {'cv': 10}
+  )
   automl.fit(X_train, y_train)
   y_pred = automl.predict(X_test)
   pipelines = str(pd.DataFrame(pd.Series(automl.show_models())).iloc[0].squeeze().split('\n')) # Pipeline