SkBlaz · Copilot · Aug 29, 2025 · Aug 29, 2025 · Aug 29, 2025 · Aug 29, 2025
diff --git a/autoBOTLib/features/features_reading_comperhension.py b/autoBOTLib/features/features_reading_comperhension.py
@@ -216,7 +216,8 @@ def transform(self, new_documents):
                                   total=len(new_documents)):
             for mid, method in enumerate(self.features):
                 value = self.features[method](doc)
-                new_features[mid] = value
+                if mid < new_features.shape[1]:  # Check column bounds
+                    new_features[enx, mid] = value
 
         return new_features
 

diff --git a/autoBOTLib/features/features_topic.py b/autoBOTLib/features/features_topic.py
@@ -49,7 +49,14 @@ def fit(self, text_list):
         docspace = self.clx.fit_transform(text_list).T
         fnames = [(x, y) for x, y in self.clx.vocabulary_.items()]
         fnames = [x[0] for x in sorted(fnames, key=lambda x: x[1])]
-        self.clustering_algo = MiniBatchKMeans(n_clusters=self.ndim)
+
+        # Ensure we don't have more clusters than samples
+        n_samples = docspace.shape[0]
+        n_clusters = min(self.ndim, n_samples - 1) if n_samples > 1 else 1
+        if n_clusters < 1:
+            n_clusters = 1
+
+        self.clustering_algo = MiniBatchKMeans(n_clusters=n_clusters)
         clusters = self.clustering_algo.fit(docspace)
         assert len(clusters.labels_) == docspace.shape[0]
         cluster_assignments = clusters.labels_

diff --git a/autoBOTLib/optimization/optimization_engine.py b/autoBOTLib/optimization/optimization_engine.py
@@ -14,6 +14,7 @@
 from autoBOTLib.learning.torch_sparse_nn import torch_learners
 import operator
 import copy
+import gc
 from deap import base, creator, tools
 import logging
 
@@ -573,7 +574,7 @@ def custom_initialization(self):
             if self.verbose:
                 logging.info(pair)
 
-        weights = np.array(performances) / max(performances)
+        weights = np.array(performances) / max(performances) if len(performances) > 0 and max(performances) > 0 else np.ones(len(performances))
         generic_individual = self.generate_random_initial_state(weights)
         assert len(generic_individual) == self.weight_params
         for ind in self.population:
@@ -617,7 +618,12 @@ def apply_weights(self,
 
         # Copy the space as it will be subsetted.
         if not custom_feature_space:
-            tmp_space = sparse.csr_matrix(self.train_feature_space.copy())
+            # Use a more memory-efficient copy approach
+            tmp_space = self.train_feature_space.copy()
+            if sparse.issparse(tmp_space):
+                tmp_space = sparse.csr_matrix(tmp_space)
+            else:
+                tmp_space = sparse.csr_matrix(tmp_space)
 
         else:
             tmp_space = sparse.csr_matrix(custom_feature_matrix)
@@ -889,11 +895,23 @@ def probability_extraction(self, pred_matrix):
         zero_index = np.where(csum == 0)[0]
 
         for j in zero_index:
-            prob_df.iloc[j, self.majority_class] = 1
+            # Ensure majority_class index is within bounds
+            if self.majority_class < prob_df.shape[1]:
+                prob_df.iloc[j, self.majority_class] = 1
+            else:
+                # Use the first column if majority_class is out of bounds
+                prob_df.iloc[j, 0] = 1
 
         prob_df = prob_df.fillna(0)
         assert len(np.where(prob_df.sum(axis=1) < 1)[0]) == 0
 
+        # Clean up temporary matrices
+        if 'prediction_matrix_final' in locals():
+            del prediction_matrix_final
+        if 'transformed_instances' in locals():
+            del transformed_instances
+        gc.collect()
+
         return prob_df
 
     def transform(self, instances):
@@ -991,6 +1009,14 @@ def predict(self, instances):
             if self.verbose:
                 logging.info("Predictions obtained")
 
+            # Clean up temporary matrices
+            del transformed_instances
+            if 'pspace' in locals():
+                del pspace
+            if 'subsetted_space' in locals():
+                del subsetted_space
+            gc.collect()
+
             return all_predictions
 
     def mode_pred(self, prediction_matrix):
@@ -1280,6 +1306,11 @@ def instantiate_validation_env(self):
             combine_with_existing_representation=self.
             combine_with_existing_representation)
 
+        # Check if feature construction failed
+        if self.train_feature_space is None:
+            raise RuntimeError("Feature construction failed - unable to create feature matrix. "
+                             "This might be due to insufficient samples or incompatible data.")
+
         self.all_feature_names = []
         if self.verbose:
             logging.info("Initialized training matrix of dimension {}".format(
@@ -1294,9 +1325,8 @@ def instantiate_validation_env(self):
         for transformer in self.vectorizer.named_steps[
                 'union'].transformer_list:
             features = transformer[1].steps[1][1].get_feature_names_out()
-            self.feature_subspaces.append(
-                self.train_feature_space[:, current_fnum:(current_fnum +
-                                                          len(features))])
+            # Store only metadata instead of the actual subspace data to save memory
+            # The subspace can be recreated when needed from the main feature space
             current_fnum += len(features)
             self.all_feature_names += list(features)
             num_feat = len(features)
@@ -1691,4 +1721,15 @@ def evolve(self,
                 single_learner = (learner, individual, score)
                 self.ensemble_of_learners.append(single_learner)
 
+        # Clean up memory after evolution
+        if hasattr(self, 'population'):
+            del self.population
+        if hasattr(self, 'fitness_container'):
+            # Keep only the most recent fitness values, clear older ones
+            if len(self.fitness_container) > 10:
+                self.fitness_container = self.fitness_container[-10:]
+
+        # Force garbage collection to free up memory
+        gc.collect()
+
         return self
diff --git a/autoBOTLib/optimization/optimization_feature_constructors.py b/autoBOTLib/optimization/optimization_feature_constructors.py
@@ -151,7 +151,7 @@ def remove_url(text, replace_token):
     :return str string: A new text
     """
 
-    regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
+    regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
     return re.sub(regex, replace_token, text)
 
 
@@ -374,6 +374,7 @@ def get_simple_features(df_data, max_num_feat=10000):
     except Exception as es:
         print(es, "Feature construction error.")
         tokenizer = None
+        data_matrix = None
 
     return tokenizer, feature_names, data_matrix
 
@@ -633,4 +634,5 @@ def get_features(df_data,
         print(es, "Feature construction error.")
         tokenizer = None
 
+        data_matrix = None
     return tokenizer, feature_names, data_matrix
diff --git a/debug_test.py b/debug_test.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+"""
+Debug the specific indexing error
+"""
+
+import autoBOTLib
+import pandas as pd
+import traceback
+
+def debug_test():
+    """Debug the exact issue"""
+
+    print("Debug test...")
+    try:
+        dataframe = pd.read_csv("data/insults/train.tsv", sep="\t").head(50)  # Even smaller
+        train_sequences = dataframe['text_a']
+        train_targets = dataframe['label']
+
+        print(f"Data shape: {len(train_sequences)}")
+        print(f"Targets: {set(train_targets)}")
+
+        autoBOTLibObj = autoBOTLib.GAlearner(
+            train_sequences,
+            train_targets,
+            representation_type="neurosymbolic",
+            n_fold_cv=2,  # Smaller CV
+            sparsity=0.8,  # Higher sparsity
+            time_constraint=0.005,
+            hof_size=1,
+            verbose=1  # Enable verbose for debugging
+        )
+
+        print("Training...")
+        autoBOTLibObj.evolve(strategy="direct-learning")
+
+        print("Testing prediction with 1 sample...")
+        predictions = autoBOTLibObj.predict([train_sequences.iloc[0]])
+        print(f"Prediction successful: {predictions}")
+
+        return True
+
+    except Exception as e:
+        print(f"Error: {e}")
+        traceback.print_exc()
+        return False
+
+if __name__ == "__main__":
+    debug_test()