diff --git a/docs/api/features/explained_variance.rst b/docs/api/features/explained_variance.rst new file mode 100644 index 000000000..808272963 --- /dev/null +++ b/docs/api/features/explained_variance.rst @@ -0,0 +1,54 @@ +.. -*- mode: rst -*- + +Explained Variance +================== + +================= ================= +Visualizer :class:`~yellowbrick.features.explained_variance.ExplainedVariance` +Quick Method :func:`~yellowbrick.features.explained_variance.explained_variance` +Models Decomposition +Workflow Feature Engineering +================= ================= + +.. plot:: + :context: close-figs + :alt: Explained variance visualizer on the credit dataset + + from yellowbrick.datasets import load_credit + from yellowbrick.features import ExplainedVariance + + # Load the features of interest (the target is not used) + X, _ = load_credit() + + # Instantiate the visualizer, fit and transform the data + oz = ExplainedVariance() + oz.fit_transform(X) + oz.show() + + +Quick Method +------------ + +The same functionality as above can be achieved with the associated quick method ``explained_variance``. This method will build the ``ExplainedVariance`` visualizer with the associated arguments, fit it, then (optionally) immediately show it. + +.. plot:: + :context: close-figs + :alt: Explained variance quick method on the concrete dataset + + from yellowbrick.datasets import load_concrete + from yellowbrick.features import explained_variance + + # Load the features of interest (the target is not used) + X, _ = load_concrete() + + # Determine the optimal number of components + oz = explained_variance(X) + + +API Reference +------------- + +.. 
automodule:: yellowbrick.features.explained_variance + :members: ExplainedVariance, explained_variance + :undoc-members: + :show-inheritance: diff --git a/docs/api/features/index.rst b/docs/api/features/index.rst index 3dce89d93..1f32de088 100644 --- a/docs/api/features/index.rst +++ b/docs/api/features/index.rst @@ -47,6 +47,7 @@ finalizes and displays the image. radviz rankd pcoords - pca manifold + pca + explained_variance jointplot diff --git a/docs/changelog.rst b/docs/changelog.rst index 1f9fabf1f..20751be70 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -3,12 +3,23 @@ Changelog ========= +Under Development +----------------- + +Major Changes: + - New ``ExplainedVariance`` visualizer that assists in the selection of the number of + components for principal component analysis. The visualizer plots both cumulative + and discrete explained variance against the number of components and shades in + percentiles of total explained variance for visual selection of the best fit. + + + Version 1.1 ----------- * Tag: v1.1_ * Deployed Wednesday, February 12, 2020 -* Contributors: Benjamin Bengfort, Rebecca Bilbro, Kristen McIntyre, Larry Gray, Prema Roman, Adam Morris, Shivendra Sharma, Michael Chestnut, Michael Garod, Naresh Bachwani, Piyush Gautam, Daniel Navarrete, Molly Morrison, Emma Kwiecinska, Sarthak Jain, Tony Ojeda, Edwin Schmier, Nathan Danielsen +* Contributors: Benjamin Bengfort, Rebecca Bilbro, Kristen McIntyre, Larry Gray, Prema Roman, Adam Morris, Shivendra Sharma, Michael Chestnut, Michael Garod, Naresh Bachwani, Piyush Gautam, Daniel Navarrete, Molly Morrison, Emma Kwiecinska, Sarthak Jain, Tony Ojeda, Edwin Schmierer, Nathan Danielsen Major Changes: - Quick methods (aka Oneliners), which return a fully fitted finalized visualizer object in only a single line, are now implemented for all Yellowbrick Visualizers. Test coverage has been added for all quick methods. 
The documentation has been updated to document and demonstrate the usage of the quick methods. diff --git a/docs/index.rst b/docs/index.rst index 7ca743ad2..1ee0e8cfc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -56,8 +56,9 @@ Feature Visualization - :doc:`api/features/rankd`: pairwise ranking of features to detect relationships - :doc:`api/features/pcoords`: horizontal visualization of instances - :doc:`Radial Visualization `: separation of instances around a circular plot -- :doc:`api/features/pca`: projection of instances based on principal components - :doc:`api/features/manifold`: high dimensional visualization with manifold learning +- :doc:`api/features/pca`: projection of instances based on principal components +- :doc:`api/features/explained_variance`: select number of components for PCA - :doc:`Joint Plots `: direct data visualization with feature selection Classification Visualization diff --git a/tests/test_features/test_explained_variance.py b/tests/test_features/test_explained_variance.py new file mode 100644 index 000000000..acee543f7 --- /dev/null +++ b/tests/test_features/test_explained_variance.py @@ -0,0 +1,43 @@ +# tests.test_features.test_explained_variance +# Tests for the PCA explained variance visualizer +# +# Author: Benjamin Bengfort +# Created: Mon Feb 10 19:11:46 2020 -0500 +# +# Copyright (C) 2019 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: test_explained_variance.py [] benjamin@bengfort.com $ + +""" +Tests for the PCA explained variance visualizer +""" + +########################################################################## +## Imports +########################################################################## + +from tests.base import VisualTestCase + +from yellowbrick.datasets import load_credit +from yellowbrick.features.explained_variance import * + + +########################################################################## +## ExplainedVariance Tests 
+########################################################################## + +class TextExplainedVariance(VisualTestCase): + """ + Test the explained variance visualizer + """ + + def test_quick_method(self): + """ + Test the explained variance quick method + """ + X, _ = load_credit() + oz = explained_variance(X) + + assert isinstance(oz, ExplainedVariance) + self.assert_images_similar(oz) diff --git a/yellowbrick/features/__init__.py b/yellowbrick/features/__init__.py index 7b7fc0fc2..c689f8e23 100644 --- a/yellowbrick/features/__init__.py +++ b/yellowbrick/features/__init__.py @@ -24,6 +24,7 @@ from .jointplot import JointPlot, JointPlotVisualizer, joint_plot from .pca import PCA, PCADecomposition, pca_decomposition from .manifold import Manifold, manifold_embedding +from .explained_variance import ExplainedVariance, explained_variance # Alias the TargetType defined in yellowbrick.utils.target from yellowbrick.utils.target import TargetType diff --git a/yellowbrick/features/decomposition.py b/yellowbrick/features/decomposition.py deleted file mode 100644 index 5a4971911..000000000 --- a/yellowbrick/features/decomposition.py +++ /dev/null @@ -1,174 +0,0 @@ -# yellowbrick.features.decomposition -# -# Author: George Richardson -# Created: Fri Mar 2 16:16:00 2018 +0000 -# -# Copyright (C) 2016 The scikit-yb developers -# For license information, see LICENSE.txt -# -# ID: decomposition.py [0ed6e8a] g.raymond.richardson@gmail.com $ - -########################################################################## -## Imports -########################################################################## - -from yellowbrick.style import palettes -from yellowbrick.features.base import FeatureVisualizer - -from sklearn.pipeline import Pipeline -from sklearn.decomposition import PCA -from sklearn.preprocessing import StandardScaler - -########################################################################## -## Quick Methods 
-########################################################################## - - -def explained_variance_visualizer( - X, - y=None, - ax=None, - scale=True, - center=True, - colormap=palettes.DEFAULT_SEQUENCE, - **kwargs -): - """Produce a plot of the explained variance produced by a dimensionality - reduction algorithm using n=1 to n=n_components dimensions. This is a single - plot to help identify the best trade off between number of dimensions - and amount of information retained within the data. - - Parameters - ---------- - X : ndarray or DataFrame of shape n x m - A matrix of n rows with m features - - y : ndarray or Series of length n - An array or Series of target or class values - - ax : matplotlib Axes, default: None - The aces to plot the figure on - - scale : bool, default: True - Boolean that indicates if the values of X should be scaled. - - colormap : string or cmap, default: None - optional string or matplotlib cmap to colorize lines - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. - - kwargs : dict - Keyword arguments that are passed to the base class and may influence - the visualization as defined in other Visualizers. 
- - Returns - ------- - viz : ExplainedVariance - Returns the fitted, finalized visualizer - - Examples - -------- - >>> from sklearn import datasets - >>> bc = datasets.load_breast_cancer() - >>> X = bc = bc.data - >>> explained_variance_visualizer(X, scale=True, center=True, colormap='RdBu_r') - - """ - - # Instantiate the visualizer - visualizer = ExplainedVariance(X=X) - - # Fit and transform the visualizer (calls draw) - visualizer.fit(X, y, **kwargs) - visualizer.transform(X) - visualizer.finalize() - - # Return the visualizer object - return visualizer - - -########################################################################## -## Explained Variance Feature Visualizer -########################################################################## - - -class ExplainedVariance(FeatureVisualizer): - """ - - Parameters - ---------- - ax : matplotlib Axes, default: None - The aces to plot the figure on - - scale : bool, default: True - Boolean that indicates if the values of X should be scaled. - - colormap : string or cmap, default: None - optional string or matplotlib cmap to colorize lines - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. - - kwargs : dict - Keyword arguments that are passed to the base class and may influence - the visualization as defined in other Visualizers. 
- - - Examples - -------- - - >>> visualizer = ExplainedVariance() - >>> visualizer.fit(X) - >>> visualizer.transform(X) - >>> visualizer.show() - - """ - - def __init__( - self, - ax=None, - scale=True, - center=True, - n_components=None, - colormap=palettes.DEFAULT_SEQUENCE, - **kwargs - ): - - super(ExplainedVariance, self).__init__(ax=ax, **kwargs) - - self.colormap = colormap - self.n_components = n_components - self.center = center - self.scale = scale - self.pipeline = Pipeline( - [ - ("scale", StandardScaler(with_mean=self.center, with_std=self.scale)), - ("pca", PCA(n_components=self.n_components)), - ] - ) - self.pca_features = None - - @property - def explained_variance_(self): - return self.pipeline.steps[-1][1].explained_variance_ - - def fit(self, X, y=None): - self.pipeline.fit(X) - self.draw() - return self - - def transform(self, X): - self.pca_features = self.pipeline.transform(X) - return self.pca_features - - def draw(self): - X = self.explained_variance_ - self.ax.plot(X) - return self.ax - - def finalize(self, **kwargs): - # Set the title - self.set_title("Explained Variance Plot") - - # Set the axes labels - self.ax.set_ylabel("Explained Variance") - self.ax.set_xlabel("Number of Components") diff --git a/yellowbrick/features/explained_variance.py b/yellowbrick/features/explained_variance.py new file mode 100644 index 000000000..83897fd4e --- /dev/null +++ b/yellowbrick/features/explained_variance.py @@ -0,0 +1,346 @@ +# yellowbrick.features.explained_variance +# +# Author: George Richardson +# Author: Benjamin Bengfort +# Created: Fri Mar 2 16:16:00 2018 +0000 +# +# Copyright (C) 2016 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: explained_variance.py [0ed6e8a] g.raymond.richardson@gmail.com $ + +########################################################################## +## Imports +########################################################################## + +import bisect +import numpy as np + +from 
sklearn.pipeline import Pipeline +from sklearn.decomposition import PCA +from yellowbrick.exceptions import NotFitted +from sklearn.preprocessing import StandardScaler +from yellowbrick.features.base import FeatureVisualizer + + +########################################################################## +## Explained Variance Feature Visualizer +########################################################################## + + +class ExplainedVariance(FeatureVisualizer): + """ + + Parameters + ---------- + ax : matplotlib Axes, default: None + The axes to plot the figure on. If None is passed in, the current axes + will be used (or generated if required). + + transformer : PCA or Pipeline, default: None + By default the visualizer creates a PCA transformer with all components, scaling + the data on request. Users can submit their own transformer or pipeline to + visualize the explained variance for, so long as the transformer or the last + step in the pipeline has ``explained_variance_`` and + ``explained_variance_ratio_`` learned attributes after being fitted. + + cumulative : bool, default: True + Display the cumulative explained variance of components ordered by magnitude, + otherwise display each component's direct value. + + ratio : bool, default: True + Display the ratio of the component's explained variance to the total variance, + otherwise display the amount of variance. + + scale : bool, default: True + If true, the default PCA used by the visualizer has a standard scaler applied + to the data using the mean and standard deviation. This argument is ignored if + a user supplied transformer exists. + + n_components : int, default: None + Limits the number of components whose variance is explained in the default + transformer created by the visualizer. This argument is ignored if a user supplied + transformer exists. + + is_fitted : bool, default=False + Specify if the user supplied transformer is already fitted. 
If False, the + transformer will be fit when the visualizer is fit, otherwise the transformer + will not be modified. Note that if a user supplied transformer is fitted, then + no additional call to the visualizer ``fit()`` method is required (a unique + behavior of the ``ExplainedVariance`` visualizer). + + random_state : int, RandomState instance or None, optional (default None) + Set the random state on the underlying PCA solver. Note that if a user supplied + transformer exists, this parameter is ignored. + + kwargs : dict + Keyword arguments that are passed to the base class and may influence + the visualization as defined in other Visualizers. + + Attributes + ---------- + explained_variance_ : array, shape (n_components,) + The amount of variance explained by each of the selected components. + Equal to n_components largest eigenvalues of the covariance matrix of X. + + explained_variance_ratio_ : array, shape (n_components,) + Percentage of variance explained by each of the selected components. + If n_components is not set then all components are stored and the sum of the + ratios is equal to 1.0. + + Examples + -------- + >>> visualizer = ExplainedVariance() + >>> visualizer.fit_transform(X) + >>> visualizer.show() + + Notes + ----- + This visualizer wraps (by default) a sklearn.decomposition.PCA object which may have + many other learned attributes of interest, such as ``singular_values_`` or + ``noise_variance_``. To access these properties of the fitted underlying + decomposition, use the visualizer's ``transformer`` property. 
+ """ + + def __init__( + self, + ax=None, + transformer=None, + cumulative=True, + ratio=True, + scale=True, + n_components=None, + is_fitted=False, + random_state=None, + **kwargs + ): + # Initialize the visualizer + super(ExplainedVariance, self).__init__(ax=ax, **kwargs) + + # Set the transformer and drawing parameters + self.cumulative = cumulative + self.ratio = ratio + self.scale = scale + self.n_components = n_components + self.is_fitted = is_fitted + self.random_state = random_state + + # NOTE: this parameter must be set last to initialize a new transformer + self.transformer = transformer + + # Keep track of internal state + self._drawn_on_fit = False + + @property + def transformer(self): + """ + Returns the underlying transformer used for explained variance. + """ + return self._transformer + + @transformer.setter + def transformer(self, transformer): + """ + Creates a PCA pipeline using scaling and number of components if None is passed + in, otherwise sets the user supplied transformer for use in the visualization. + """ + if transformer is None: + # In this case we have to fit the underlying model, so ignore the user's is_fitted + self.is_fitted = False + + # Create the default PCA transformer since none was supplied + transformer = PCA( + n_components=self.n_components, random_state=self.random_state + ) + + # Add a standard scaler if specified + if self.scale: + transformer = Pipeline([ + ("scale", StandardScaler(with_mean=True, with_std=True)), + ("pca", transformer) + ]) + + self._transformer = transformer + + def fit(self, X, y=None): + """ + Fits the underlying transformer on X (unless already fitted) and draws the result. + + Parameters + ---------- + X : array-like of shape (n, m) + A matrix or data frame with n instances and m features + + y : array-like of shape (n,), optional + A vector or series with target values for each instance in X. + Not used for ExplainedVariance but allowed here to support visual pipelines. 
+ + Returns + ------- + self : ExplainedVariance + Returns the visualizer object. + """ + if not self.is_fitted: + self.transformer.fit(X) + + # Get the explained variance learned attributes from the transformer + self._set_explained_variance_attributes() + self.draw() + + # Prevent duplicate drawing on calls to fit_transform() + self._drawn_on_fit = True + return self + + def transform(self, X=None, y=None): + """ + Transform the data using the underlying transformer, which usually performs + dimensionality reduction on the input features ``X``. This method can also be + called with a fitted model without passing data in order to draw the explained + variances without data. + + Parameters + ---------- + X : ndarray or DataFrame of shape n x m, optional + A matrix of n instances with m features. + + y : ndarray or Series of length n, optional + An array or series of target or class values. + Not used by the transformer, but supported to allow visual pipelines. + + Returns + ------- + Xp : ndarray or DataFrame of shape n x m + Returns a new array-like object of transformed features of shape + ``(len(X), self.n_components)``, or None if ``X`` is not passed in. + """ + if not self._drawn_on_fit: + # Draw on transform instead - note that this may change the attributes + self._set_explained_variance_attributes() + self.draw() + + if X is not None: + return self.transformer.transform(X) + return None + + def draw(self): + if self.ratio: + if not hasattr(self, "explained_variance_ratio_"): + raise NotFitted(( + "transformer does not have the explained_variance_ratio_, " + "use ratio=False or ensure the visualizer is fitted." + )) + X = self.explained_variance_ratio_ + + else: + if not hasattr(self, "explained_variance_"): + raise NotFitted(( + "transformer does not have the explained_variance_, " + "use ratio=True or ensure the visualizer is fitted." 
+ )) + X = self.explained_variance_ + + label = self.transformer.__class__.__name__ + if isinstance(self.transformer, Pipeline): + label = self.transformer.steps[-1][1].__class__.__name__ + + if self.cumulative: + X = np.cumsum(X) + self.ax.plot(X, label=label) + + # TODO: allow the user to specify the cutoff amounts + for cutoff in [0.0, .50, .85, .95, .999]: + components = bisect.bisect_left(X, cutoff) + self.ax.fill_between(np.arange(0, components), 0, X[:components], color='b', alpha=min(1-cutoff+.2, 1), label="{:0.0f}%".format(cutoff*100)) + + else: + self.ax.plot(X, label=label) + + # TODO: visualize the amount of explained variance from each component + + return self.ax + + def finalize(self, **kwargs): + # Set the title + title = "Explained Variance" + if self.cumulative: + title = "Cumulative " + title + self.set_title(title) + + if self.ratio: + self.ax.set_ylabel("ratio of explained variance") + else: + self.ax.set_ylabel("explained variance") + + self.ax.set_xlabel("number of components") + self.ax.legend(loc="best", frameon=True) + + def _set_explained_variance_attributes(self): + """ + Helper function to discover the required attributes on the transformer. Does + not raise any exceptions if they cannot be found, but does not set the + attributes on the visualizer if they aren't. + """ + obj = self.transformer + if isinstance(obj, Pipeline): + obj = obj.steps[-1][1] + + for attr in ("explained_variance_", "explained_variance_ratio_"): + if hasattr(obj, attr): + setattr(self, attr, getattr(obj, attr)) + + +########################################################################## +## Quick Method +########################################################################## + +def explained_variance( + X, + y=None, + ax=None, + show=True, + **kwargs +): + """ExplainedVariance quick method. 
+ + Parameters + ---------- + X : ndarray or DataFrame of shape n x m + A matrix of n instances with m features to determine principal components for. 
+ + y : ndarray or Series of length n, default: None + An array or series of target or class values. This argument is not used but is + enabled for pipeline purposes. + + ax : matplotlib Axes, default: None + The axes to plot the figure on. If None is passed in, the current axes + will be used (or generated if required). + + show : bool, default: True + If True, calls ``show()``, which in turn calls ``plt.show()`` however you cannot + call ``plt.savefig`` from this signature, nor ``clear_figure``. If False, simply + calls ``finalize()`` + + kwargs : dict + Keyword arguments that are passed to the ``ExplainedVariance`` visualizer and + its base class and may influence the visualization. + + Returns + ------- + viz : ExplainedVariance + Returns the fitted, finalized visualizer + """ + + # Instantiate the visualizer on the requested axes with the user's options + oz = ExplainedVariance(ax=ax, **kwargs) + + # Fit and transform the visualizer (calls draw) + oz.fit(X, y) + oz.transform(X) + + if show: + oz.show() + else: + oz.finalize() + + # Return the visualizer object + return oz diff --git a/yellowbrick/features/pca.py b/yellowbrick/features/pca.py index 913c134ba..96a450dc5 100644 --- a/yellowbrick/features/pca.py +++ b/yellowbrick/features/pca.py @@ -471,8 +471,8 @@ def pca_decomposition( show=True, **kwargs ): + """PCA quick method. - """ Produce a two or three dimensional principal component plot of the data array ``X`` projected onto its largest sequential principal components. It is common practice to scale the data array ``X`` before applying a PC decomposition. Variable scaling @@ -557,35 +557,10 @@ def pca_decomposition( Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. - Attributes - ---------- - pca_components_ : ndarray, shape (n_features, n_components) - This tells about the magnitude of each feature in the pricipal components. - This is primarily used to draw the biplots. 
- - classes_ : ndarray, shape (n_classes,) - The class labels that define the discrete values in the target. Only - available if the target type is discrete. This is guaranteed to be - strings even if the classes are a different type. - - features_ : ndarray, shape (n_features,) - The names of the features discovered or used in the visualizer that - can be used as an index to access or modify data in X. If a user passes - feature names in, those features are used. Otherwise the columns of a - DataFrame are used or just simply the indices of the data array. - - range_ : (min y, max y) - A tuple that describes the minimum and maximum values in the target. - Only available if the target type is continuous. - - Examples - -------- - >>> from sklearn import datasets - >>> iris = datasets.load_iris() - >>> X = iris.data - >>> y = iris.target - >>> pca_decomposition(X, y, colors=['r', 'g', 'b'], projection=3) - + Returns + ------- + viz : PCA + Returns the fitted, finalized visualizer """ # Instantiate the visualizer visualizer = PCA(