Skip to content

Commit 16ccbee

Browse files
committed
deleting bigger datasets
1 parent 6d98002 commit 16ccbee

File tree

7 files changed

+95
-2370293
lines changed

7 files changed

+95
-2370293
lines changed

01_compare_linear_models.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
"""
2+
Which linear models are optimal?
3+
"""
4+
5+
import numpy as np
6+
import pandas as pd
7+
from scipy.io import arff
8+
from sklearn.svm import SVC
9+
from sklearn.linear_model import RidgeClassifier, LogisticRegression
10+
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
11+
from sklearn.preprocessing import MinMaxScaler
12+
from sklearn.ensemble import BaggingClassifier
13+
from sklearn.pipeline import Pipeline
14+
from sklearn.calibration import CalibratedClassifierCV
15+
16+
17+
# Worker count for grid search and cross-validation (passed as n_jobs).
N_JOBS = 24


# Meta-table describing the candidate datasets; transposed so each row is one
# dataset.  The `mv` column appears to count missing values — presumably;
# verify against how database.json is generated.
database = pd.read_json("database.json").T
# note: for now we will ignore those with missing values
# there are very few of them
# note: this meta-dataset has swallowed information about what's categorical and what isn't
# which means we are just going to be using each feature as continuous even though it
# may not be
database = database[database.mv == 0]
27+
28+
29+
def load_data(data_name):
    """Load an ARFF dataset and return an (X, y) pair ready for sklearn.

    Parameters
    ----------
    data_name : str
        Basename of the file under ``datasets/`` (without ``.arff``).

    Returns
    -------
    X : ndarray
        Dummy/one-hot encoded feature matrix built from every column
        except ``"Class"``.
    y : ndarray
        Integer-encoded class labels, numbered in order of first appearance.
    """
    data, _meta = arff.loadarff(f"datasets/{data_name}.arff")
    df = pd.DataFrame(data)

    # Coerce each column to numeric where possible, leaving non-numeric
    # columns untouched.  Replaces the deprecated (removed in pandas 3)
    # ``pd.to_numeric(..., errors="ignore")`` idiom with an explicit fallback.
    def _numeric_or_original(col):
        try:
            return pd.to_numeric(col)
        except (ValueError, TypeError):
            return col

    df = df.apply(_numeric_or_original)

    X = pd.get_dummies(df.loc[:, df.columns != "Class"]).values

    # Map each distinct label to a small integer.  The original ran the
    # (now-deprecated) ``df.applymap`` over the *whole* frame and then kept
    # only the "Class" column; mapping just that one column is equivalent
    # (every Class value is a key of labels_dict) and far cheaper.
    unique_labels = df["Class"].unique()
    labels_dict = dict(zip(unique_labels, range(len(unique_labels))))
    y = df["Class"].map(labels_dict).values
    return X, y
38+
39+
40+
def evaluate_pipeline_helper(X, y, pipeline, p_grid, random_state=0):
    """Score one pipeline/grid combination with nested cross-validation.

    An inner 4-fold CV drives the grid search while an outer 4-fold CV
    measures generalization; both are stratified, shuffled, and share the
    same seed.  Scoring is weighted one-vs-rest ROC AUC throughout.

    Returns the array of outer-fold scores.
    """
    def _cv():
        # Fresh splitter per level; identical seed keeps runs reproducible.
        return StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)

    tuned_model = GridSearchCV(
        estimator=pipeline,
        param_grid=p_grid,
        cv=_cv(),
        scoring="roc_auc_ovr_weighted",
        n_jobs=N_JOBS,
    )
    return cross_val_score(
        tuned_model, X=X, y=y, cv=_cv(), scoring="roc_auc_ovr_weighted", n_jobs=N_JOBS
    )
48+
49+
50+
def define_and_evaluate_pipelines(X, y, random_state=0):
    """Run nested CV for three linear classifiers on (X, y).

    The candidates, each on min-max scaled features:
      1. linear-kernel SVC (probability estimates enabled),
      2. logistic regression (saga solver),
      3. a bagged ensemble of ridge classifiers (saga solver).
    Each is tuned over 10 log-spaced regularization values in [1e-7, 1e2].

    Returns the three arrays of outer-fold nested scores, in that order.
    """
    reg_values = np.logspace(-7, 2, 10)

    svc_pipe = Pipeline([
        ("scaler", MinMaxScaler()),
        ("svc", SVC(kernel="linear", probability=True, random_state=random_state)),
    ])
    svc_grid = {"svc__C": reg_values}

    logistic_pipe = Pipeline([
        ("scaler", MinMaxScaler()),
        ("logistic", LogisticRegression(solver="saga", max_iter=10000, random_state=random_state)),
    ])
    logistic_grid = {"logistic__C": reg_values}

    ridge_bag = BaggingClassifier(
        Pipeline([
            ("scaler", MinMaxScaler()),
            ("ridge", RidgeClassifier(solver="saga", random_state=random_state)),
        ])
    )
    # NOTE(review): the "base_estimator__" parameter path assumes
    # sklearn < 1.2 (later versions renamed it to "estimator") — confirm
    # against the pinned sklearn version.
    ridge_grid = {"base_estimator__ridge__alpha": reg_values}

    return (
        evaluate_pipeline_helper(X, y, svc_pipe, svc_grid, random_state=random_state),
        evaluate_pipeline_helper(X, y, logistic_pipe, logistic_grid, random_state=random_state),
        evaluate_pipeline_helper(X, y, ridge_bag, ridge_grid, random_state=random_state),
    )
80+
81+
82+
# run models on all datasets
# Evaluate every dataset whose sample count falls in (25, 1000); collect one
# nested-score array per model family, plus the names actually evaluated.
results1, results2, results3 = [], [], []
evaluated_datasets = []
for i, dataset_name in enumerate(database.index.values):
    X, y = load_data(dataset_name)
    # Skip datasets that are too small for 4x4 nested CV or too big to run.
    if not (25 < len(y) < 1000):
        continue
    print(i, dataset_name, len(y))
    scores1, scores2, scores3 = define_and_evaluate_pipelines(X, y)
    results1.append(scores1)
    results2.append(scores2)
    results3.append(scores3)
    evaluated_datasets.append(dataset_name)

0 commit comments

Comments
 (0)