Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion autoBOTLib/features/features_reading_comperhension.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,8 @@ def transform(self, new_documents):
total=len(new_documents)):
for mid, method in enumerate(self.features):
value = self.features[method](doc)
new_features[mid] = value
if mid < new_features.shape[1]: # Check column bounds
new_features[enx, mid] = value

return new_features

Expand Down
9 changes: 8 additions & 1 deletion autoBOTLib/features/features_topic.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,14 @@ def fit(self, text_list):
docspace = self.clx.fit_transform(text_list).T
fnames = [(x, y) for x, y in self.clx.vocabulary_.items()]
fnames = [x[0] for x in sorted(fnames, key=lambda x: x[1])]
self.clustering_algo = MiniBatchKMeans(n_clusters=self.ndim)

# Ensure we don't have more clusters than samples
n_samples = docspace.shape[0]
n_clusters = min(self.ndim, n_samples - 1) if n_samples > 1 else 1
if n_clusters < 1:
n_clusters = 1

self.clustering_algo = MiniBatchKMeans(n_clusters=n_clusters)
clusters = self.clustering_algo.fit(docspace)
assert len(clusters.labels_) == docspace.shape[0]
cluster_assignments = clusters.labels_
Expand Down
53 changes: 47 additions & 6 deletions autoBOTLib/optimization/optimization_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from autoBOTLib.learning.torch_sparse_nn import torch_learners
import operator
import copy
import gc
from deap import base, creator, tools
import logging

Expand Down Expand Up @@ -573,7 +574,7 @@ def custom_initialization(self):
if self.verbose:
logging.info(pair)

weights = np.array(performances) / max(performances)
weights = np.array(performances) / max(performances) if len(performances) > 0 and max(performances) > 0 else np.ones(len(performances))
generic_individual = self.generate_random_initial_state(weights)
assert len(generic_individual) == self.weight_params
for ind in self.population:
Expand Down Expand Up @@ -617,7 +618,12 @@ def apply_weights(self,

# Copy the space as it will be subsetted.
if not custom_feature_space:
tmp_space = sparse.csr_matrix(self.train_feature_space.copy())
# Use a more memory-efficient copy approach
tmp_space = self.train_feature_space.copy()
if sparse.issparse(tmp_space):
tmp_space = sparse.csr_matrix(tmp_space)
else:
tmp_space = sparse.csr_matrix(tmp_space)

else:
tmp_space = sparse.csr_matrix(custom_feature_matrix)
Expand Down Expand Up @@ -889,11 +895,23 @@ def probability_extraction(self, pred_matrix):
zero_index = np.where(csum == 0)[0]

for j in zero_index:
prob_df.iloc[j, self.majority_class] = 1
# Ensure majority_class index is within bounds
if self.majority_class < prob_df.shape[1]:
prob_df.iloc[j, self.majority_class] = 1
else:
# Use the first column if majority_class is out of bounds
prob_df.iloc[j, 0] = 1

prob_df = prob_df.fillna(0)
assert len(np.where(prob_df.sum(axis=1) < 1)[0]) == 0

# Clean up temporary matrices
if 'prediction_matrix_final' in locals():
del prediction_matrix_final
if 'transformed_instances' in locals():
del transformed_instances
gc.collect()

return prob_df

def transform(self, instances):
Expand Down Expand Up @@ -991,6 +1009,14 @@ def predict(self, instances):
if self.verbose:
logging.info("Predictions obtained")

# Clean up temporary matrices
del transformed_instances
if 'pspace' in locals():
del pspace
if 'subsetted_space' in locals():
del subsetted_space
gc.collect()

return all_predictions

def mode_pred(self, prediction_matrix):
Expand Down Expand Up @@ -1280,6 +1306,11 @@ def instantiate_validation_env(self):
combine_with_existing_representation=self.
combine_with_existing_representation)

# Check if feature construction failed
if self.train_feature_space is None:
raise RuntimeError("Feature construction failed - unable to create feature matrix. "
"This might be due to insufficient samples or incompatible data.")

self.all_feature_names = []
if self.verbose:
logging.info("Initialized training matrix of dimension {}".format(
Expand All @@ -1294,9 +1325,8 @@ def instantiate_validation_env(self):
for transformer in self.vectorizer.named_steps[
'union'].transformer_list:
features = transformer[1].steps[1][1].get_feature_names_out()
self.feature_subspaces.append(
self.train_feature_space[:, current_fnum:(current_fnum +
len(features))])
# Store only metadata instead of the actual subspace data to save memory
# The subspace can be recreated when needed from the main feature space
current_fnum += len(features)
self.all_feature_names += list(features)
num_feat = len(features)
Expand Down Expand Up @@ -1691,4 +1721,15 @@ def evolve(self,
single_learner = (learner, individual, score)
self.ensemble_of_learners.append(single_learner)

# Clean up memory after evolution
if hasattr(self, 'population'):
del self.population
if hasattr(self, 'fitness_container'):
# Keep only the most recent fitness values, clear older ones
if len(self.fitness_container) > 10:
self.fitness_container = self.fitness_container[-10:]

# Force garbage collection to free up memory
gc.collect()

return self
4 changes: 3 additions & 1 deletion autoBOTLib/optimization/optimization_feature_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def remove_url(text, replace_token):
:return str string: A new text
"""

regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
return re.sub(regex, replace_token, text)


Expand Down Expand Up @@ -374,6 +374,7 @@ def get_simple_features(df_data, max_num_feat=10000):
except Exception as es:
print(es, "Feature construction error.")
tokenizer = None
data_matrix = None

return tokenizer, feature_names, data_matrix

Expand Down Expand Up @@ -633,4 +634,5 @@ def get_features(df_data,
print(es, "Feature construction error.")
tokenizer = None

data_matrix = None
return tokenizer, feature_names, data_matrix
48 changes: 48 additions & 0 deletions debug_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env python3
"""
Debug the specific indexing error
"""

import autoBOTLib
import pandas as pd
import traceback

def debug_test():
"""Debug the exact issue"""

print("Debug test...")
try:
dataframe = pd.read_csv("data/insults/train.tsv", sep="\t").head(50) # Even smaller
train_sequences = dataframe['text_a']
train_targets = dataframe['label']

print(f"Data shape: {len(train_sequences)}")
print(f"Targets: {set(train_targets)}")

autoBOTLibObj = autoBOTLib.GAlearner(
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use neurosymbolic config @copilot for tests

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated debug_test.py to use "neurosymbolic" representation type instead of "symbolic". This provides more comprehensive feature extraction including both symbolic and neural features for better test coverage.

train_sequences,
train_targets,
representation_type="neurosymbolic",
n_fold_cv=2, # Smaller CV
sparsity=0.8, # Higher sparsity
time_constraint=0.005,
hof_size=1,
verbose=1 # Enable verbose for debugging
)

print("Training...")
autoBOTLibObj.evolve(strategy="direct-learning")

print("Testing prediction with 1 sample...")
predictions = autoBOTLibObj.predict([train_sequences.iloc[0]])
print(f"Prediction successful: {predictions}")

return True

except Exception as e:
print(f"Error: {e}")
traceback.print_exc()
return False

if __name__ == "__main__":
debug_test()
Loading
Loading