-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgradient_boost_classifier.py
More file actions
94 lines (81 loc) · 4.31 KB
/
gradient_boost_classifier.py
File metadata and controls
94 lines (81 loc) · 4.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
from itertools import product

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
def _read_dataset(file_path):
    """Load a headerless CSV and split it into features and labels.

    Every column except the last is returned as the feature frame; the last
    column is returned as the label series.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(features, labels)`` tuple of a DataFrame and a Series.

    Raises:
        FileNotFoundError: If ``file_path`` is not an existing regular file.
    """
    # Fail loudly here: silently returning None would only surface later as an
    # opaque "cannot unpack non-iterable NoneType" error in the caller.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Dataset file not found: {file_path}")
    frame = pd.read_csv(file_path, header=None)
    return frame.iloc[:, :-1], frame.iloc[:, -1]


def _build_classifier(criterion, max_depth, n_estimators, learning_rate, subsample):
    """Create a GradientBoostingClassifier with a fixed random seed (42)."""
    return GradientBoostingClassifier(
        criterion=criterion,
        max_depth=max_depth,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        subsample=subsample,
        random_state=42,
    )


def train_gradient_boost(data_dir="./all_data", clauses=None, examples=None):
    """Grid-search, retrain and evaluate a gradient boosting model per dataset.

    For every (clause count, example count) pair the function reads
    ``train_c{c}_d{e}.csv``, ``valid_c{c}_d{e}.csv`` and ``test_c{c}_d{e}.csv``
    from ``data_dir``, fits one classifier per hyperparameter combination on
    the training split and scores it on the validation split. The combination
    with the highest validation accuracy (the first one found wins ties) is
    then refit on train + validation and evaluated on the test split.

    Args:
        data_dir: Directory containing the CSV files.
        clauses: Clause-count labels used in the file names. Defaults to
            ``["300", "500", "1000", "1500", "1800"]``.
        examples: Example-count labels used in the file names; each one must
            have an entry in the ``max_depths`` grid below. Defaults to
            ``["100", "1000", "5000"]``.

    Returns:
        A dict mapping ``(c, e)`` to the list ``[criterion, max_depth,
        n_estimators, learning_rate, subsample, test_accuracy, test_f1]``.

    Raises:
        FileNotFoundError: If any of the expected CSV files is missing.
        ValueError: If an entry of ``examples`` has no ``max_depths`` grid.
    """
    if clauses is None:
        clauses = ["300", "500", "1000", "1500", "1800"]
    if examples is None:
        examples = ["100", "1000", "5000"]

    # hyperparams
    criterions = ["friedman_mse", "squared_error"]
    # The depth grid depends on the example-count label of the dataset.
    max_depths = {
        "100": [2, 4, 8, 10],
        "1000": [5, 10, 15, 20],
        "5000": [5, 10, 20, 30],
    }
    n_estimators = [10, 20, 50]
    learning_rates = [0.01, 0.05, 0.1]
    subsamples = [0.5, 1.0]

    report = {}

    for c, e in product(clauses, examples):
        # Validate before any I/O so a bad label fails fast and clearly.
        if e not in max_depths:
            raise ValueError(
                f"No max_depth grid defined for examples={e!r}; "
                f"expected one of {sorted(max_depths)}"
            )

        # read data
        train_data_x, train_data_y = _read_dataset(
            os.path.join(data_dir, f"train_c{c}_d{e}.csv")
        )
        validate_data_x, validate_data_y = _read_dataset(
            os.path.join(data_dir, f"valid_c{c}_d{e}.csv")
        )
        test_data_x, test_data_y = _read_dataset(
            os.path.join(data_dir, f"test_c{c}_d{e}.csv")
        )

        # train with train data and use validate to tune
        print("======================")
        print(f"Training Dataset: clauses: {c}, examples: {e}")

        best_params = (None, None, None, None, None)
        max_acc = float("-inf")

        # product() iterates in the same order as the equivalent nested loops
        # (rightmost grid varies fastest), so tie-breaking is unchanged.
        grid = product(criterions, max_depths[e], n_estimators, learning_rates, subsamples)
        for crit, depth, number_trees, lr, subsample in grid:
            print(f"Hypertuning for criterion: {crit}, depth: {depth}, n_estimator: {number_trees}, learning_rate: {lr}, subsample: {subsample}")
            gradient = _build_classifier(crit, depth, number_trees, lr, subsample)
            gradient.fit(train_data_x, train_data_y)
            predictions = gradient.predict(validate_data_x)
            accuracy = accuracy_score(validate_data_y, predictions)

            # update best hyper params (strict ">" keeps the first best found)
            if accuracy > max_acc:
                max_acc = accuracy
                best_params = (crit, depth, number_trees, lr, subsample)

        best_crit, best_depth, best_n_estimator, best_lr, best_subsample = best_params

        # Combine train and validate and retrain
        print(f"Best Params: criterion: {best_crit}, depth: {best_depth}, n_estimator: {best_n_estimator}, best_lr: {best_lr}, best_subsample: {best_subsample}, accuracy: {max_acc}")
        train_valid_x = pd.concat([train_data_x, validate_data_x], ignore_index=True)
        train_valid_y = pd.concat([train_data_y, validate_data_y], ignore_index=True)

        gradient = _build_classifier(*best_params)
        gradient.fit(train_valid_x, train_valid_y)
        final_predictions = gradient.predict(test_data_x)

        # Eval Metrics
        final_accuracy = accuracy_score(test_data_y, final_predictions)
        final_f1 = f1_score(test_data_y, final_predictions)
        print(f"Final Evaluations: accuracy:{final_accuracy}, f1: {final_f1}")

        report[(c, e)] = [
            best_crit,
            best_depth,
            best_n_estimator,
            best_lr,
            best_subsample,
            final_accuracy,
            final_f1,
        ]

    return report