|
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import GridSearchCV, validation_curve
| 10 | + |
| 11 | +##################################### |
| 12 | +#### Grid Search Hyperparameters #### |
| 13 | +##################################### |
| 14 | + |
| 15 | + |
| 16 | +# Not loaded yet |
| 17 | +# If you want to release this, include this in the __init__.py file, and show an example of how to use. |
| 18 | + |
def plot_hyper_validation_curve(estimator, X, y, param_name, param_range
                                ,title = "Validation Curve", xlabel = None, legend_loc='best', logX=False
                                ,cv=None,scoring="accuracy",n_jobs=None,verbose=0, ax = None):
    """
    Plot a validation curve (train vs. cross-validation score) for a single
    hyperparameter, with +/- one standard deviation bands.

    * Adapted from:
      - https://chrisalbon.com/machine_learning/model_evaluation/plot_the_validation_curve/
      - https://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#sphx-glr-auto-examples-model-selection-plot-validation-curve-py

    Parameters
    ----------
    estimator : sklearn estimator
        Unfitted estimator to evaluate.
    X, y : array-likes
        Training data and targets, passed through to validation_curve.
    param_name : str
        Name of the hyperparameter to vary (as accepted by the estimator).
    param_range : array-like
        Values of the hyperparameter to evaluate.
    title : str, default "Validation Curve"
        Axes title.
    xlabel : str or None
        X-axis label; if None, one is built from param_name.
    legend_loc : str, default 'best'
        Passed to ax.legend.
    logX : bool, default False
        If True, plot against log10 of param_range and note it in the label.
    cv, scoring, n_jobs, verbose
        Passed through to sklearn.model_selection.validation_curve.
    ax : matplotlib Axes or None
        Axes to draw on; if None, the current axes is used.
    """

    # Fix: the original dereferenced ax unconditionally, crashing with
    # AttributeError whenever the caller relied on the documented ax=None default.
    if ax is None:
        ax = plt.gca()

    # Compute scores on training and CV folds across the parameter range
    train_scores, test_scores = validation_curve(estimator=estimator
                                                 ,X=X,y=y
                                                 ,param_name=param_name
                                                 ,param_range=param_range
                                                 ,cv=cv
                                                 ,scoring=scoring
                                                 ,verbose=verbose
                                                 ,n_jobs=n_jobs
                                                 )

    # Mean and standard deviation across CV folds, for the training scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)

    # ... and for the cross-validation (test-fold) scores
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Optionally transform the x-axis to log scale for wide parameter ranges
    x_label_append = ""  # empty if not log scale
    if logX:
        param_range = np.log10(param_range)
        x_label_append = " (log scale)"

    # Plot mean scores for training and cross-validation sets
    ax.plot(param_range, train_mean, label="Training Score", color="r")
    ax.plot(param_range, test_mean, label="Cross-Validation Score", color="g")

    # Plot +/- 1 std-dev bands for training and cross-validation sets
    ax.fill_between(param_range, train_mean - train_std, train_mean + train_std, color="r", alpha=0.1)
    ax.fill_between(param_range, test_mean - test_std, test_mean + test_std, color="g", alpha=0.1)

    # Add annotations
    ax.title.set_text(title)
    ax.set_ylabel((scoring + " Score").title())
    if xlabel is None:
        ax.set_xlabel("Hyperparameter: " + param_name + x_label_append)
    else:
        ax.set_xlabel(xlabel + x_label_append)

    ax.legend(loc=legend_loc)
def plot_hyper_validation_curves(estimator, X, y, param_grid, scoring
                        ,logX=False
                        ,cv=None, refit = 'Accuracy'
                        ,n_jobs=None,verbose=0,arFigsize=None ):

    """
    Plot train/cross-validation curves for several metrics against a single
    hyperparameter, using one GridSearchCV fit for all metrics.

    * Adapted from https://scikit-learn.org/stable/auto_examples/model_selection/plot_multi_metric_evaluation.html
    * Uses GridSearchCV to evaluate many metrics at once. Hence, if evaluating
      many metrics at once, this is often faster than using validation_curve,
      which would have to be run (trained) once for each metric.

    * Currently supports only one hyperparameter at a time, but can score
      multiple metrics against that hyperparameter.

    Parameters
    ----------
    estimator : sklearn estimator
        Unfitted estimator to evaluate.
    X, y : array-likes
        Training data and targets, passed through to GridSearchCV.fit.
    param_grid : dict
        Expected to contain exactly one hyperparameter name -> value range.
        With more than one key the per-scorer curves may not be as expected.
    scoring : dict or list of str
        Metrics to evaluate; one subplot is drawn per metric.
    logX : bool, default False
        If True, plot against log10 of the parameter values.
    cv, refit, n_jobs, verbose
        Passed through to GridSearchCV.
    arFigsize : tuple or None
        Figure size; if None, sized as (12, 4 * number_of_subplot_rows).

    Returns
    -------
    matplotlib Figure containing the grid of validation-curve subplots.
    """

    # Fix: the original hard-coded n_jobs=-1 here, silently ignoring the
    # function's own n_jobs parameter.
    gs = GridSearchCV(estimator = estimator
                      ,param_grid=param_grid
                      ,scoring=scoring, cv=cv, refit=refit
                      ,return_train_score=True,n_jobs=n_jobs,verbose=verbose)
    gs.fit(X, y)
    results = gs.cv_results_

    # Lay out a 2-column grid with one subplot per scorer
    num_rows = math.ceil(len(scoring)/2)
    if arFigsize is None:
        arFigsize = (12, num_rows*4)
    fig, axes = plt.subplots(num_rows, 2, figsize=arFigsize, squeeze=False)

    # For this function, we only expect 1 key.
    # If you have more than 1 parameter for the grid search, results may not
    # be as expected.
    for hyper in param_grid:
        # Get the regular numpy array from the MaskedArray
        X_axis = np.array(results['param_' + hyper].data, dtype=float)

        x_label_append = " "
        if logX:
            X_axis = np.log10(X_axis)
            x_label_append = " (log scale)"

        for i, scorer in enumerate(scoring):
            title = "Validation Curve: " + scorer.title()

            # Fill the grid row-major: two scorers per row
            ax = axes[i // 2, i % 2]
            for sample, style, color in (('train', 'o-', 'r'), ('test', 'o-', 'g')):
                sample_score_mean = results['mean_%s_%s' % (sample, scorer)]
                sample_score_std = results['std_%s_%s' % (sample, scorer)]

                ax.plot(X_axis, sample_score_mean, style, color=color,
                        label="%s (%s)" % (scorer, sample))

                # +/- 1 std-dev band across CV folds
                ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                                sample_score_mean + sample_score_std,
                                alpha=0.1, color=color)

            # Add annotations
            ax.title.set_text(title)
            ax.set_ylim(-0.1, 1.1)
            ax.set_ylabel(scorer.title() + " Score")
            ax.set_xlabel("Hyperparameter: " + hyper + x_label_append)
            ax.grid()
            ax.legend(loc="best")

            # rank_test_* == 1 marks the best parameter setting for this scorer
            best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
            best_score = results['mean_test_%s' % scorer][best_index]

            # Plot a dotted vertical line at the best score for that scorer marked by x
            ax.plot([X_axis[best_index], ] * 2, [0, best_score],
                    linestyle='-.', color='black', marker='x', markeredgewidth=3, ms=8)

            # Annotate the best score for that scorer
            ax.annotate("%0.2f" % best_score,
                        (X_axis[best_index], best_score + 0.005))

    # Backward-compatible: previous version returned None (callers could not
    # have used the return value); returning the figure enables saving/reuse.
    return fig
| 151 | + |
| 152 | + |
0 commit comments