Skip to content

Commit 77b063e

Browse files
committed
0.0.1b6 released with basic scikit-learn helper functions
1 parent 939eacc commit 77b063e

34 files changed

+3678
-209
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@ This is a helper package for a variety of functions as described in the Overview
44

55
# Installation
66

7-
pip install more==0.0.1b5
7+
pip install more==0.0.1b6
88

99
# Overview
1010

1111
This is a helper package for a variety of functions
1212
1. Extension for Pandas Dataframe (Beta version released)
1313
2. Extension for Visualization (Beta version released)
14-
3. Extension for Scikit-learn (TBD)
14+
3. Extension for Scikit-learn (Beta version released)
1515

1616
# Examples
1717
Check out the [examples](https://github.com/ngupta23/more/tree/master/examples) folder for details on usage
Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,4 @@
1-
name = "Scikit Helper"
1+
from .plot_learning_curves import plot_learning_curves, plot_learning_curve
2+
from .plot_hyper_validation_curves import plot_hyper_validation_curves
3+
from .train_classifier import train_classifier
4+
from .plot_classification import print_classification_details, plot_classification_report
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# For Time Logging
2+
import time
3+
from contextlib import contextmanager
4+
import logging
5+
6+
# Timing helper: wraps a code block and logs its elapsed wall-clock time.
@contextmanager
def time_usage(name=""):
    """Log the wall-clock time taken by the enclosed code block.

    Parameters
    ----------
    name : str, optional
        Label prepended to the log message so concurrent timings can be
        told apart (default: empty string).

    Yields
    ------
    None
        Control returns to the caller's block; on exit the elapsed time
        in seconds is written via ``logging.info``.
    """
    start = time.time()
    yield
    end = time.time()
    # Round to 10 decimal places purely for readable log output.
    elapsed_seconds = float("%.10f" % (end - start))
    logging.info('%s: Time Taken (seconds): %s', name, elapsed_seconds)
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import numpy as np
2+
import pandas as pd
3+
import matplotlib.pyplot as plt
4+
5+
from sklearn import metrics as mt
6+
7+
8+
def print_classification_details(actual, predicted, verbose=False):
    """Print the confusion matrix and classification report for a prediction.

    Parameters
    ----------
    actual : array-like
        Ground-truth class labels.
    predicted : array-like
        Predicted class labels, same length as ``actual``.
    verbose : bool, optional
        When True, additionally render the classification report as a
        heatmap via ``plot_classification_report`` (default: False).
    """
    cm = mt.confusion_matrix(actual, predicted)
    cr = mt.classification_report(actual, predicted)

    print("confusion matrix\n", cm)
    print(cr)

    # Idiomatic truthiness test instead of `== True`.
    if verbose:
        plot_classification_report(cr)
18+
19+
20+
def plot_classification_report(cr, title=None, cmap='RdBu'):
    """Render a text classification report as an annotated heatmap.

    Adapted from
    https://medium.com/district-data-labs/visual-diagnostics-for-more-informed-machine-learning-7ec92960c96b

    Parameters
    ----------
    cr : str
        Output of ``sklearn.metrics.classification_report`` (text format).
    title : str, optional
        Plot title (default: "Classification report").
    cmap : str, optional
        Matplotlib colormap name for the heatmap (default: 'RdBu').
    """
    title = title or 'Classification report'
    lines = cr.split('\n')
    classes = []
    matrix = []

    # Parse the per-class rows of the report; the slice skips the header
    # and the summary/average rows at the bottom.
    # NOTE(review): this slicing assumes the classic sklearn report layout —
    # confirm against the sklearn version in use.
    for line in lines[2:(len(lines) - 5)]:
        s = line.split()
        if not s:
            # Robustness: skip blank lines rather than crash on float('').
            continue
        classes.append(s[0])
        # Keep precision/recall/f1; drop the trailing support column.
        value = [float(x) for x in s[1: len(s) - 1]]
        matrix.append(value)

    fig, ax = plt.subplots(1)

    # BUG FIX: the original iterated columns with range(len(matrix)+1),
    # which only equals the number of measures (3) when there are exactly
    # 2 classes; any other class count raised IndexError. Iterate the
    # actual per-row measure count instead.
    num_measures = len(matrix[0]) if matrix else 0
    for row in range(len(classes)):
        for column in range(num_measures):
            txt = matrix[row][column]
            ax.text(column, row, txt, va='center', ha='center', size="x-large",
                    bbox=dict(facecolor='white', alpha=0.5))

    fig = plt.imshow(matrix, interpolation='nearest', cmap=cmap, vmin=0, vmax=1)
    plt.title(title)
    plt.colorbar()
    # BUG FIX: tick count must match the 3 measure labels, not
    # len(classes)+1 (which again only worked for 2 classes).
    x_tick_marks = np.arange(num_measures)
    y_tick_marks = np.arange(len(classes))
    plt.xticks(x_tick_marks, ['Precision', 'Recall', 'F1-score'], rotation=45)
    plt.yticks(y_tick_marks, classes)
    plt.ylabel('Classes')
    plt.xlabel('Measures')
    plt.show()
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
import numpy as np
2+
import pandas as pd
3+
import matplotlib.pyplot as plt
4+
import seaborn as sns
5+
6+
import math
7+
8+
from sklearn.model_selection import GridSearchCV
9+
10+
11+
#####################################
12+
#### Grid Search Hyperparameters ####
13+
#####################################
14+
15+
16+
# Not loaded yet
17+
# If you want to release this, include this in the __init__.py file, and show an example of how to use.
18+
19+
def plot_hyper_validation_curve(estimator, X, y, param_name, param_range
                                , title="Validation Curve", xlabel=None, legend_loc='best', logX=False
                                , cv=None, scoring="accuracy", n_jobs=None, verbose=0, ax=None):
    """Plot train/CV score of one metric versus one hyperparameter.

    * Adapted from:
        - https://chrisalbon.com/machine_learning/model_evaluation/plot_the_validation_curve/
        - https://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html

    Parameters
    ----------
    estimator : sklearn estimator
        Estimator implementing fit/predict; cloned for each validation.
    X, y : array-like
        Training features and targets.
    param_name : str
        Name of the hyperparameter to vary.
    param_range : array-like
        Values of the hyperparameter to evaluate.
    title : str, optional
        Plot title.
    xlabel : str, optional
        X-axis label override; defaults to "Hyperparameter: <param_name>".
    legend_loc : str, optional
        Matplotlib legend location.
    logX : bool, optional
        When True, plot the hyperparameter on a log10 scale.
    cv, scoring, n_jobs, verbose :
        Forwarded to ``sklearn.model_selection.validation_curve``.
    ax : matplotlib Axes, optional
        Axes to draw on; created automatically when omitted.
    """
    # BUG FIX: validation_curve was never imported at module level (only
    # GridSearchCV is), so every call raised NameError. Import locally to
    # keep this module's top-level imports unchanged.
    from sklearn.model_selection import validation_curve

    # BUG FIX: ax defaulted to None but was used unconditionally below,
    # raising AttributeError when the caller omitted it.
    if ax is None:
        _, ax = plt.subplots(1)

    # Score on training and CV folds across the range of parameter values.
    train_scores, test_scores = validation_curve(estimator=estimator
                                                 , X=X, y=y
                                                 , param_name=param_name
                                                 , param_range=param_range
                                                 , cv=cv
                                                 , scoring=scoring
                                                 , verbose=verbose
                                                 , n_jobs=n_jobs
                                                 )

    # Mean and standard deviation across CV folds, per parameter value.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Transform the X axis for log-scale plotting.
    x_label_append = ""  # empty if not log scale
    if logX:
        param_range = np.log10(param_range)
        x_label_append = " (log scale)"

    # Mean score curves for training and CV sets.
    ax.plot(param_range, train_mean, label="Training Score", color="r")
    ax.plot(param_range, test_mean, label="Cross-Validation Score", color="g")

    # +/- 1 std-dev bands around each curve.
    ax.fill_between(param_range, train_mean - train_std, train_mean + train_std, color="r", alpha=0.1)
    ax.fill_between(param_range, test_mean - test_std, test_mean + test_std, color="g", alpha=0.1)

    # Annotations.
    ax.title.set_text(title)
    ax.set_ylabel((scoring + " Score").title())
    if xlabel is None:
        ax.set_xlabel("Hyperparameter: " + param_name + x_label_append)
    else:
        ax.set_xlabel(xlabel + x_label_append)

    ax.legend(loc=legend_loc)
72+
73+
def plot_hyper_validation_curves(estimator, X, y, param_grid, scoring
                                 , logX=False
                                 , cv=None, refit='Accuracy'
                                 , n_jobs=None, verbose=0, arFigsize=None):
    """Plot validation curves for several metrics against one hyperparameter.

    * Adapted from https://scikit-learn.org/stable/auto_examples/model_selection/plot_multi_metric_evaluation.html
    * Uses GridSearchCV to evaluate many metrics at once. Hence, when
      evaluating many metrics, this is often faster than validation_curve,
      which would have to be run (trained) once per metric.
    * Currently supports only one hyperparameter at a time, but can score
      multiple metrics against that hyperparameter.

    Parameters
    ----------
    estimator : sklearn estimator
        Estimator to evaluate.
    X, y : array-like
        Training features and targets.
    param_grid : dict
        Grid with a SINGLE hyperparameter key; more keys give undefined plots.
    scoring : iterable of str
        Scorer names; one subplot is drawn per scorer.
    logX : bool, optional
        When True, plot the hyperparameter on a log10 scale.
    cv, refit, n_jobs, verbose :
        Forwarded to GridSearchCV.
    arFigsize : tuple, optional
        Figure size; defaults to (12, 4 * rows).
    """
    # BUG FIX: the original hard-coded n_jobs=-1 here, silently ignoring
    # the function's n_jobs parameter. Honor the parameter instead.
    gs = GridSearchCV(estimator=estimator
                      , param_grid=param_grid
                      , scoring=scoring, cv=cv, refit=refit
                      , return_train_score=True, n_jobs=n_jobs, verbose=verbose)
    gs.fit(X, y)
    results = gs.cv_results_

    # Plotting the result: two subplots per row.
    num_rows = math.ceil(len(scoring) / 2)
    if arFigsize is None:
        arFigsize = (12, num_rows * 4)
    fig, axes = plt.subplots(num_rows, 2, figsize=arFigsize, squeeze=False)

    # We expect exactly 1 key in param_grid; with more than one parameter
    # the results may not be as expected.
    for hyper in param_grid:
        # Get the regular numpy array from the MaskedArray.
        X_axis = np.array(results['param_' + hyper].data, dtype=float)

        x_label_append = " "
        if logX:
            X_axis = np.log10(X_axis)
            x_label_append = " (log scale)"

        for i, scorer in enumerate(scoring):
            title = "Validation Curve: " + scorer.title()

            ax = axes[i // 2, i % 2]
            for sample, style, color in (('train', 'o-', 'r'), ('test', 'o-', 'g')):
                sample_score_mean = results['mean_%s_%s' % (sample, scorer)]
                sample_score_std = results['std_%s_%s' % (sample, scorer)]

                ax.plot(X_axis, sample_score_mean, style, color=color,
                        label="%s (%s)" % (scorer, sample))

                ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                                sample_score_mean + sample_score_std,
                                alpha=0.1, color=color)

            # Annotations.
            ax.title.set_text(title)
            ax.set_ylim(-0.1, 1.1)
            ax.set_ylabel(scorer.title() + " Score")
            ax.set_xlabel("Hyperparameter: " + hyper + x_label_append)
            ax.grid()
            ax.legend(loc="best")

            best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
            best_score = results['mean_test_%s' % scorer][best_index]

            # Dotted vertical line at the best score for this scorer, marked by x.
            ax.plot([X_axis[best_index], ] * 2, [0, best_score],
                    linestyle='-.', color='black', marker='x', markeredgewidth=3, ms=8)

            # Annotate the best score for this scorer.
            ax.annotate("%0.2f" % best_score,
                        (X_axis[best_index], best_score + 0.005))
150+
151+
152+
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import numpy as np
2+
import pandas as pd
3+
import matplotlib.pyplot as plt
4+
import seaborn as sns
5+
import math
6+
7+
from sklearn.model_selection import learning_curve
8+
from .common import time_usage
9+
10+
#########################
11+
#### Learning Curves ####
12+
#########################
13+
14+
15+
def plot_learning_curve(estimator, title_suffix, X, y, scoring="accuracy", ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.2, 1.0, 5), verbose=1, ax=None):
    """Plot the train/CV learning curve for one scoring metric.

    NOTE: Adopted from
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html,
    but allows plotting multiple metrics at a time when called from within
    plot_learning_curves (note the s at the end).

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title_suffix : string
        Appended to the chart title and used as the y-axis label.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    scoring : string, optional
        Scorer name passed to ``learning_curve`` (default: "accuracy").

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y values plotted; defaults to (-0.1, 1.1).

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds,
          - :term:`CV splitter`,
          - an iterable yielding (train, test) splits as arrays of indices.
        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` is used. If the estimator is not a
        classifier or if ``y`` is neither binary nor multiclass,
        :class:`KFold` is used.

    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples used to generate
        the learning curve. Floats are fractions of the maximum training-set
        size, i.e. within (0, 1]; ints are absolute training-set sizes.
        (default: np.linspace(0.2, 1.0, 5))

    verbose : int, optional
        Verbosity forwarded to ``learning_curve``.

    ax : matplotlib Axes, optional
        Axes to draw on; created automatically when omitted.
    """
    # BUG FIX: ax defaulted to None but was used unconditionally below,
    # raising AttributeError when the caller omitted it.
    if ax is None:
        _, ax = plt.subplots(1)

    train_sizes, train_scores, test_scores = learning_curve(estimator
                                                            , X, y
                                                            , cv=cv
                                                            , scoring=scoring
                                                            , n_jobs=n_jobs
                                                            , train_sizes=train_sizes
                                                            , verbose=verbose
                                                            )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # +/- 1 std-dev bands around each curve.
    ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std, alpha=0.1,
                    color="r")
    ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                    test_scores_mean + test_scores_std, alpha=0.1, color="g")
    ax.plot(train_sizes, train_scores_mean, 'o-', color="r",
            label="Training score")
    ax.plot(train_sizes, test_scores_mean, 'o-', color="g",
            label="Cross-validation score")

    ## Annotate plots. Collapsed the original redundant double check on
    ## ylim into a single if/else.
    ax.title.set_text("Learning Curve: " + title_suffix)
    if ylim is None:
        ax.set_ylim(-0.1, 1.1)
    else:
        ax.set_ylim(*ylim)

    ax.set_ylabel(title_suffix + " Score")
    ax.set_xlabel("Training examples")
    # COMPAT FIX: grid's `b` keyword was renamed/removed in modern
    # matplotlib; the positional form works across versions.
    ax.grid(True)
    ax.legend(loc="best")
111+
112+
113+
def plot_learning_curves(estimator, X, y, scoring, cv=None, n_jobs=None, verbose=1, arFigsize=None):
    """Plot one learning curve per scorer, two subplots per row.

    Parameters
    ----------
    estimator : sklearn estimator
        Estimator to evaluate.
    X, y : array-like
        Training features and targets.
    scoring : dict
        Mapping of display name -> scorer passed to ``learning_curve``;
        one subplot is drawn per entry.
    cv, n_jobs, verbose :
        Forwarded to ``plot_learning_curve``.
    arFigsize : tuple, optional
        Figure size; defaults to (12, 4 * rows).
    """
    num_rows = math.ceil(len(scoring) / 2)
    if arFigsize is None:
        arFigsize = (12, num_rows * 4)

    fig, axes = plt.subplots(num_rows, 2, figsize=arFigsize, squeeze=False)
    # enumerate replaces the original manual counter.
    for i, scorer in enumerate(scoring.keys()):
        # Log how long each curve takes to train/plot.
        with time_usage(" Learning Curve | " + scorer):
            plot_learning_curve(estimator=estimator
                                , title_suffix=scorer.title()
                                , X=X, y=y
                                , scoring=scoring[scorer]
                                , cv=cv
                                , n_jobs=n_jobs
                                , verbose=verbose
                                , ax=axes[i // 2, i % 2]
                                )

0 commit comments

Comments
 (0)