Skip to content

Commit f21a78e

Browse files
author
Sergey Feldman
committed
lightgbm wrapup
1 parent 6adcbb7 commit f21a78e

8 files changed

+297
-482
lines changed

01_compare_baseline_models.py

Lines changed: 4 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,11 @@
11
"""
2-
Which linear models are optimal?
2+
Which baseline models are best?
33
"""
44

5-
import os
65
import time
76
import pickle
87
import numpy as np
98
import pandas as pd
10-
from scipy.io import arff
119
from sklearn.svm import SVC
1210
from sklearn.linear_model import LogisticRegression
1311
from sklearn.ensemble import RandomForestClassifier
@@ -16,6 +14,7 @@
1614
from sklearn.ensemble import BaggingClassifier
1715
from sklearn.pipeline import Pipeline
1816
from sklearn.svm import SVC
17+
from utils import load_data
1918

2019

2120
N_JOBS = 4 * 4 * 9
@@ -28,20 +27,6 @@
2827
database = database[database.nrow >= 50]
2928

3029

31-
def load_data(data_name):
32-
file_path = f"datasets/{data_name}.arff"
33-
if os.path.exists(file_path):
34-
data, meta = arff.loadarff(file_path)
35-
df = pd.DataFrame(data).apply(lambda x: pd.to_numeric(x, errors="ignore"))
36-
X = pd.get_dummies(df.loc[:, df.columns != "Class"]).values
37-
unique_labels = df["Class"].unique()
38-
labels_dict = dict(zip(unique_labels, range(len(unique_labels))))
39-
df.loc[:, "Class"] = df.applymap(lambda s: labels_dict.get(s) if s in labels_dict else s)
40-
y = df["Class"].values
41-
return X, y
42-
return [], []
43-
44-
4530
def evaluate_pipeline_helper(X, y, pipeline, param_grid, random_state=0):
4631
inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
4732
outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
@@ -58,7 +43,7 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
5843
[("scaler", MinMaxScaler()), ("svc", SVC(kernel="linear", probability=True, random_state=random_state))]
5944
)
6045
param_grid1 = {
61-
"svc__C": [1e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 1e1, 1e2],
46+
"svc__C": [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2],
6247
}
6348

6449
# logistic regression
@@ -69,7 +54,7 @@ def define_and_evaluate_pipelines(X, y, random_state=0):
6954
]
7055
)
7156
param_grid2 = {
72-
"logistic__C": [1e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 1e1, 1e2],
57+
"logistic__C": [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2],
7358
}
7459

7560
# random forest

0 commit comments

Comments (0)