
Commit 607f9f3

Author: Sergey Feldman
Commit message: fixes to linear model run
1 parent 50d3603, commit 607f9f3

File tree

1 file changed (+31, -22 lines)


01_compare_linear_models.py

Lines changed: 31 additions & 22 deletions
@@ -4,6 +4,7 @@
 import numpy as np
 import pandas as pd
+import os
 from scipy.io import arff
 from sklearn.svm import SVC
 from sklearn.linear_model import RidgeClassifier, LogisticRegression
@@ -24,18 +25,21 @@
 # which means we are just going to be using each feature as continuous even though it
 # may not be
 database = database[database.mv == 0]
-database = database[databaes.nrow >= 50]
+database = database[database.nrow >= 50]


 def load_data(data_name):
-    data, meta = arff.loadarff(f"datasets/{data_name}.arff")
-    df = pd.DataFrame(data).apply(lambda x: pd.to_numeric(x, errors="ignore"))
-    X = pd.get_dummies(df.loc[:, df.columns != "Class"]).values
-    unique_labels = df["Class"].unique()
-    labels_dict = dict(zip(unique_labels, range(len(unique_labels))))
-    df.loc[:, "Class"] = df.applymap(lambda s: labels_dict.get(s) if s in labels_dict else s)
-    y = df["Class"].values
-    return X, y
+    file_path = f"datasets/{data_name}.arff"
+    if os.path.exists(file_path):
+        data, meta = arff.loadarff(file_path)
+        df = pd.DataFrame(data).apply(lambda x: pd.to_numeric(x, errors="ignore"))
+        X = pd.get_dummies(df.loc[:, df.columns != "Class"]).values
+        unique_labels = df["Class"].unique()
+        labels_dict = dict(zip(unique_labels, range(len(unique_labels))))
+        df.loc[:, "Class"] = df.applymap(lambda s: labels_dict.get(s) if s in labels_dict else s)
+        y = df["Class"].values
+        return X, y
+    return [], []


 def evaluate_pipeline_helper(X, y, pipeline, p_grid, random_state=0):
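
Aside: the applymap line in load_data hand-rolls an integer encoding for the string labels in the Class column, mapping each label to its first-seen index. A minimal standalone sketch of that encoding (not part of the diff; pd.factorize is an equivalent pandas shortcut):

    import pandas as pd

    # toy stand-in for df["Class"]
    labels = pd.Series(["cat", "dog", "cat", "bird"])

    # what load_data builds by hand: first-seen label -> integer index
    labels_dict = dict(zip(labels.unique(), range(labels.nunique())))
    print(labels.map(labels_dict).tolist())   # [0, 1, 0, 2]

    # equivalent pandas one-liner
    codes, uniques = pd.factorize(labels)     # codes: [0, 1, 0, 2]
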
@@ -49,13 +53,15 @@ def evaluate_pipeline_helper(X, y, pipeline, p_grid, random_state=0):


 def define_and_evaluate_pipelines(X, y, random_state=0):
+    # SVC
     pipeline1 = Pipeline(
         [("scaler", MinMaxScaler()), ("svc", SVC(kernel="linear", probability=True, random_state=random_state))]
     )
     param_grid1 = {
         "svc__C": np.logspace(-7, 2, 10),
     }

+    # logistic regression
     pipeline2 = Pipeline(
         [
             ("scaler", MinMaxScaler()),
@@ -66,8 +72,9 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
         "logistic__C": np.logspace(-7, 2, 10),
     }

+    # ridge has no predict_proba, but can become probabilistic with a bagging classifier
     pipeline3 = BaggingClassifier(
-        Pipeline([("scaler", MinMaxScaler()), ("ridge", RidgeClassifier(solver="saga", random_state=random_state)),])
+        Pipeline([("scaler", MinMaxScaler()), ("ridge", RidgeClassifier(random_state=random_state)),])
     )
     param_grid3 = {
         "base_estimator__ridge__alpha": np.logspace(-7, 2, 10),
@@ -87,15 +94,17 @@
 evaluated_datasets = []
 for i, dataset_name in enumerate(database.index.values):
     X, y = load_data(dataset_name)
-    numpy.random.seed(0)
-    if len(y) > 10000:
-        # subset to 10000
-        random_idx = np.random.choice(len(y), 10000 replace=False)
-        X = X[random_idx, :]
-        y = y[random_idx]
-    print(i, dataset_name, len(y))
-    nested_scores1, nested_scores2, nested_scores3 = define_and_evaluate_pipelines(X, y)
-    results1.append(nested_scores1)
-    results2.append(nested_scores2)
-    results3.append(nested_scores3)
-    evaluated_datasets.append(dataset_name)
+    # datasets might have too few samples overall or per class
+    if len(y) > 50 and len(pd.value_counts(y) > 16):
+        np.random.seed(0)
+        if len(y) > 10000:
+            # subset to 10000
+            random_idx = np.random.choice(len(y), 10000, replace=False)
+            X = X[random_idx, :]
+            y = y[random_idx]
+        print(i, dataset_name, len(y))
+        nested_scores1, nested_scores2, nested_scores3 = define_and_evaluate_pipelines(X, y)
+        results1.append(nested_scores1)
+        results2.append(nested_scores2)
+        results3.append(nested_scores3)
+        evaluated_datasets.append(dataset_name)
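
One caveat on the new guard: pd.value_counts(y) > 16 yields a boolean Series with one entry per class, so len(...) of it counts distinct classes and is truthy for any non-empty y; a check that every class has more than 16 samples would be (pd.value_counts(y) > 16).all(). The subsetting step itself is straightforward; a minimal sketch on synthetic data (not part of the commit):

    import numpy as np
    import pandas as pd

    y = np.random.randint(0, 3, size=25000)
    X = np.random.rand(len(y), 5)

    np.random.seed(0)
    if len(y) > 10000:
        # draw 10000 row indices without replacement, slice X and y together
        random_idx = np.random.choice(len(y), 10000, replace=False)
        X = X[random_idx, :]
        y = y[random_idx]

    print(X.shape, pd.value_counts(y).to_dict())
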
