diff --git a/doc/changelog.rst b/doc/changelog.rst
index ecd49138c5..7b28be6259 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -22,6 +22,9 @@ New Features
   makes it easy to change the ordering of a discrete variable according
   to some other variable/column.

+- :class:`~plotnine.stats.stat_smooth` can now use formulae for linear
+  models.
+
 Bug Fixes
 *********
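A minimal usage sketch of the formula support announced above (not part of the
patch; the data frame and noise level are made up for illustration, and the
call mirrors the tests added further down)::

    import numpy as np
    import pandas as pd
    from plotnine import ggplot, aes, geom_point, stat_smooth

    # Illustrative data: a noisy sine wave
    n = 100
    df = pd.DataFrame({'x': np.linspace(0, 2*np.pi, n)})
    df['y'] = np.sin(df['x']) + np.random.normal(scale=0.1, size=n)

    # The formula string is handed to patsy/statsmodels, so numpy
    # transforms of the aesthetics (e.g. np.sin(x)) are allowed.
    p = (ggplot(df, aes('x', 'y'))
         + geom_point()
         + stat_smooth(method='lm', formula='y ~ np.sin(x)', se=True))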
diff --git a/doc/conf.py b/doc/conf.py
index 6ef5b32135..24866722b2 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -410,7 +410,8 @@
     'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None),
     'sklearn': ('http://scikit-learn.org/stable/', None),
     'skmisc': ('https://has2k1.github.io/scikit-misc/', None),
-    'adjustText': ('https://adjusttext.readthedocs.io/en/latest/', None)
+    'adjustText': ('https://adjusttext.readthedocs.io/en/latest/', None),
+    'patsy': ('https://patsy.readthedocs.io/en/stable', None)
 }
diff --git a/plotnine/stats/smoothers.py b/plotnine/stats/smoothers.py
index 46bcb53a80..b2c8aabab5 100644
--- a/plotnine/stats/smoothers.py
+++ b/plotnine/stats/smoothers.py
@@ -5,6 +5,8 @@
 import pandas as pd
 import scipy.stats as stats
 import statsmodels.api as sm
+import statsmodels.formula.api as smf
+from patsy import dmatrices

 from ..exceptions import PlotnineError, PlotnineWarning
 from ..utils import get_valid_kwargs
@@ -47,17 +49,25 @@ def lm(data, xseq, **params):
     """
     Fit OLS / WLS if data has weight
     """
+    if params['formula']:
+        return lm_formula(data, xseq, **params)
+
     X = sm.add_constant(data['x'])
     Xseq = sm.add_constant(xseq)
+    weights = data.get('weight', None)

-    if 'weight' in data:
-        init_kwargs, fit_kwargs = separate_method_kwargs(
-            params['method_args'], sm.WLS, sm.WLS.fit)
-        model = sm.WLS(data['y'], X, weights=data['weight'], **init_kwargs)
-    else:
+    if weights is None:
         init_kwargs, fit_kwargs = separate_method_kwargs(
             params['method_args'], sm.OLS, sm.OLS.fit)
         model = sm.OLS(data['y'], X, **init_kwargs)
+    else:
+        if np.any(weights < 0):
+            raise ValueError(
+                "All weights must be non-negative."
+            )
+        init_kwargs, fit_kwargs = separate_method_kwargs(
+            params['method_args'], sm.WLS, sm.WLS.fit)
+        model = sm.WLS(data['y'], X, weights=weights, **init_kwargs)

     results = model.fit(**fit_kwargs)
     data = pd.DataFrame({'x': xseq})
@@ -74,10 +84,60 @@ def lm(data, xseq, **params):
     return data


+def lm_formula(data, xseq, **params):
+    """
+    Fit OLS / WLS using a formula
+    """
+    formula = params['formula']
+    eval_env = params['environment']
+    weights = data.get('weight', None)
+
+    if weights is None:
+        init_kwargs, fit_kwargs = separate_method_kwargs(
+            params['method_args'], sm.OLS, sm.OLS.fit)
+        model = smf.ols(
+            formula,
+            data,
+            eval_env=eval_env,
+            **init_kwargs
+        )
+    else:
+        if np.any(weights < 0):
+            raise ValueError(
+                "All weights must be non-negative."
+            )
+        init_kwargs, fit_kwargs = separate_method_kwargs(
+            params['method_args'], sm.WLS, sm.WLS.fit)
+        model = smf.wls(
+            formula,
+            data,
+            weights=weights,
+            eval_env=eval_env,
+            **init_kwargs
+        )
+
+    results = model.fit(**fit_kwargs)
+    data = pd.DataFrame({'x': xseq})
+    data['y'] = results.predict(data)
+
+    if params['se']:
+        _, predictors = dmatrices(formula, data, eval_env=eval_env)
+        alpha = 1 - params['level']
+        prstd, iv_l, iv_u = wls_prediction_std(
+            results, predictors, alpha=alpha)
+        data['se'] = prstd
+        data['ymin'] = iv_l
+        data['ymax'] = iv_u
+    return data
+
+
 def rlm(data, xseq, **params):
     """
     Fit RLM
     """
+    if params['formula']:
+        return rlm_formula(data, xseq, **params)
+
     X = sm.add_constant(data['x'])
     Xseq = sm.add_constant(xseq)

@@ -96,10 +156,38 @@ def rlm(data, xseq, **params):
     return data


+def rlm_formula(data, xseq, **params):
+    """
+    Fit RLM using a formula
+    """
+    eval_env = params['environment']
+    formula = params['formula']
+    init_kwargs, fit_kwargs = separate_method_kwargs(
+        params['method_args'], sm.RLM, sm.RLM.fit)
+    model = smf.rlm(
+        formula,
+        data,
+        eval_env=eval_env,
+        **init_kwargs
+    )
+    results = model.fit(**fit_kwargs)
+    data = pd.DataFrame({'x': xseq})
+    data['y'] = results.predict(data)
+
+    if params['se']:
+        warnings.warn("Confidence intervals are not yet implemented "
+                      "for RLM smoothing.", PlotnineWarning)
+
+    return data
+
+
 def gls(data, xseq, **params):
     """
     Fit GLS
     """
+    if params['formula']:
+        return gls_formula(data, xseq, **params)
+
     X = sm.add_constant(data['x'])
     Xseq = sm.add_constant(xseq)

@@ -122,10 +210,42 @@ def gls(data, xseq, **params):
     return data


+def gls_formula(data, xseq, **params):
+    """
+    Fit GLS using a formula
+    """
+    eval_env = params['environment']
+    formula = params['formula']
+    init_kwargs, fit_kwargs = separate_method_kwargs(
+        params['method_args'], sm.GLS, sm.GLS.fit)
+    model = smf.gls(
+        formula,
+        data,
+        eval_env=eval_env,
+        **init_kwargs
+    )
+    results = model.fit(**fit_kwargs)
+    data = pd.DataFrame({'x': xseq})
+    data['y'] = results.predict(data)
+
+    if params['se']:
+        _, predictors = dmatrices(formula, data, eval_env=eval_env)
+        alpha = 1 - params['level']
+        prstd, iv_l, iv_u = wls_prediction_std(
+            results, predictors, alpha=alpha)
+        data['se'] = prstd
+        data['ymin'] = iv_l
+        data['ymax'] = iv_u
+    return data
+
+
 def glm(data, xseq, **params):
     """
     Fit GLM
     """
+    if params['formula']:
+        return glm_formula(data, xseq, **params)
+
     X = sm.add_constant(data['x'])
     Xseq = sm.add_constant(xseq)

@@ -146,6 +266,29 @@ def glm(data, xseq, **params):
     return data


+def glm_formula(data, xseq, **params):
+    eval_env = params['environment']
+    init_kwargs, fit_kwargs = separate_method_kwargs(
+        params['method_args'], sm.GLM, sm.GLM.fit)
+    model = smf.glm(
+        params['formula'],
+        data,
+        eval_env=eval_env,
+        **init_kwargs
+    )
+    results = model.fit(**fit_kwargs)
+    data = pd.DataFrame({'x': xseq})
+    data['y'] = results.predict(data)
+
+    if params['se']:
+        df = pd.DataFrame({'x': xseq})
+        prediction = results.get_prediction(df)
+        ci = prediction.conf_int(alpha=1 - params['level'])
+        data['ymin'] = ci[:, 0]
+        data['ymax'] = ci[:, 1]
+    return data
+
+
 def lowess(data, xseq, **params):
     for k in ('is_sorted', 'return_sorted'):
         with suppress(KeyError):
@@ -284,7 +427,10 @@ def tdist_ci(x, df, stderr, level):
     Confidence Intervals using the t-distribution
     """
     q = (1 + level)/2
-    delta = stats.t.ppf(q, df) * stderr
+    if df is None:
+        delta = stats.norm.ppf(q) * stderr
+    else:
+        delta = stats.t.ppf(q, df) * stderr
     return x - delta, x + delta
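For reference, a standalone sketch of the fitting pattern the new ``*_formula``
helpers follow: build the model with statsmodels' formula API, predict on the
plotting sequence, and derive the confidence band from the patsy design matrix
with ``wls_prediction_std``. The data frame and formula here are illustrative,
not taken from the patch::

    import numpy as np
    import pandas as pd
    import statsmodels.formula.api as smf
    from patsy import dmatrices
    from statsmodels.sandbox.regression.predstd import wls_prediction_std

    data = pd.DataFrame({'x': np.linspace(0, 10, 50)})
    data['y'] = 2*data['x'] + np.random.normal(size=50)

    formula = 'y ~ x'
    results = smf.ols(formula, data).fit()

    # Predict on an evenly spaced sequence, like the xseq the stat uses
    new = pd.DataFrame({'x': np.linspace(0, 10, 200)})
    new['y'] = results.predict(new)

    # Rebuild the design matrix for the new points so the prediction
    # band is evaluated exactly where the smooth line is drawn
    _, predictors = dmatrices(formula, new, return_type='dataframe')
    prstd, ymin, ymax = wls_prediction_std(results, predictors, alpha=0.05)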
diff --git a/plotnine/stats/stat.py b/plotnine/stats/stat.py
index c9d091c01e..ffc5ce89e1 100644
--- a/plotnine/stats/stat.py
+++ b/plotnine/stats/stat.py
@@ -38,6 +38,9 @@ class stat(metaclass=Registry):
     # their default values.
     _aesthetics_doc = '{aesthetics_table}'

+    # The namespace in which the ggplot object is invoked
+    environment = None
+
     def __init__(self, mapping=None, data=None, **kwargs):
         kwargs = data_mapping_as_kwargs((mapping, data), kwargs)
         self._kwargs = kwargs  # Will be used to create the geom
@@ -107,9 +110,12 @@ def __deepcopy__(self, memo):
         old = self.__dict__
         new = result.__dict__

+        # Do not deepcopy _kwargs and environment; copy them by reference
+        shallow = {'_kwargs', 'environment'}
         for key, item in old.items():
-            if key == '_kwargs':
+            if key in shallow:
                 new[key] = old[key]
+                memo[id(new[key])] = new[key]
             else:
                 new[key] = deepcopy(old[key], memo)

@@ -358,6 +364,7 @@ def __radd__(self, gg, inplace=False):
             ggplot object with added layer
         """
         gg = gg if inplace else deepcopy(gg)
+        self.environment = gg.environment
         gg += self.to_layer()  # Add layer

         return gg
diff --git a/plotnine/stats/stat_smooth.py b/plotnine/stats/stat_smooth.py
index e3d5d23390..3cbcfc9074 100644
--- a/plotnine/stats/stat_smooth.py
+++ b/plotnine/stats/stat_smooth.py
@@ -76,6 +76,12 @@ def my_smoother(data, xseq, **params):
             data['ymax'] = high
             return data

+    formula : formula_like
+        An object that can be used to construct a patsy design matrix.
+        This is usually a string. You can only use a formula if ``method``
+        is one of *lm*, *ols*, *wls*, *glm*, *rlm* or *gls*, and in the
+        :ref:`formula <patsy:formulas>` you may refer to the ``x`` and
+        ``y`` aesthetic variables.
     se : bool (default: True)
         If :py:`True` draw confidence interval around the smooth line.
     n : int (default: 80)
@@ -131,6 +137,7 @@
     DEFAULT_PARAMS = {'geom': 'smooth', 'position': 'identity',
                       'na_rm': False,
                       'method': 'auto', 'se': True, 'n': 80,
+                      'formula': None,
                       'fullrange': False, 'level': 0.95,
                       'span': 0.75, 'method_args': {}}
     CREATES = {'se', 'ymin', 'ymax'}
@@ -168,6 +175,15 @@ def setup_params(self, data):
                     "facets".format(window), PlotnineWarning)
                 params['method_args']['window'] = window

+        if params['formula']:
+            allowed = {'lm', 'ols', 'wls', 'glm', 'rlm', 'gls'}
+            if params['method'] not in allowed:
+                raise ValueError(
+                    "You can only use a formula when `method` is "
+                    "one of {}".format(allowed)
+                )
+            params['environment'] = self.environment
+
         return params

     @classmethod
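The two changes above forward the namespace captured by the ggplot object to
the smoothers, so names used in a formula are looked up where the plot was
built. A hypothetical sketch of what that enables (the helper function and
data are made up for illustration; this is not code from the patch)::

    import numpy as np
    import pandas as pd
    from plotnine import ggplot, aes, geom_point, stat_smooth

    def bend(values):
        # User-defined transform living in the plotting script; patsy
        # finds it through the forwarded eval_env when parsing the formula.
        return np.sin(values)

    df = pd.DataFrame({'x': np.linspace(0, 2*np.pi, 100)})
    df['y'] = bend(df['x']) + np.random.normal(scale=0.1, size=100)

    p = (ggplot(df, aes('x', 'y'))
         + geom_point()
         + stat_smooth(method='lm', formula='y ~ bend(x)'))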
diff --git a/plotnine/tests/baseline_images/test_geom_smooth/glm_formula.png b/plotnine/tests/baseline_images/test_geom_smooth/glm_formula.png
new file mode 100644
index 0000000000..8ed229c603
Binary files /dev/null and b/plotnine/tests/baseline_images/test_geom_smooth/glm_formula.png differ
diff --git a/plotnine/tests/baseline_images/test_geom_smooth/gls_formula.png b/plotnine/tests/baseline_images/test_geom_smooth/gls_formula.png
new file mode 100644
index 0000000000..3cf4361252
Binary files /dev/null and b/plotnine/tests/baseline_images/test_geom_smooth/gls_formula.png differ
diff --git a/plotnine/tests/baseline_images/test_geom_smooth/lm_formula.png b/plotnine/tests/baseline_images/test_geom_smooth/lm_formula.png
new file mode 100644
index 0000000000..3cf4361252
Binary files /dev/null and b/plotnine/tests/baseline_images/test_geom_smooth/lm_formula.png differ
diff --git a/plotnine/tests/baseline_images/test_geom_smooth/lm_formula_weights.png b/plotnine/tests/baseline_images/test_geom_smooth/lm_formula_weights.png
new file mode 100644
index 0000000000..896baa815c
Binary files /dev/null and b/plotnine/tests/baseline_images/test_geom_smooth/lm_formula_weights.png differ
diff --git a/plotnine/tests/baseline_images/test_geom_smooth/rlm_formula.png b/plotnine/tests/baseline_images/test_geom_smooth/rlm_formula.png
new file mode 100644
index 0000000000..4c8f99f7aa
Binary files /dev/null and b/plotnine/tests/baseline_images/test_geom_smooth/rlm_formula.png differ
diff --git a/plotnine/tests/test_geom_smooth.py b/plotnine/tests/test_geom_smooth.py
index 51240cbb9b..6d7b07795f 100644
--- a/plotnine/tests/test_geom_smooth.py
+++ b/plotnine/tests/test_geom_smooth.py
@@ -4,7 +4,7 @@
 import statsmodels.api as sm

-from plotnine import ggplot, aes, geom_point, geom_smooth
+from plotnine import ggplot, aes, geom_point, geom_smooth, stat_smooth
 from plotnine.exceptions import PlotnineWarning


@@ -185,3 +185,69 @@ def test_init_and_fit_kwargs():
     )

     assert p == 'init_and_fit_kwargs'
+
+
+n = 100
+random_state = np.random.RandomState(123)
+mu = 0
+sigma = 0.065
+noise = random_state.randn(n)*sigma + mu
+df = pd.DataFrame({
+    'x': x,
+    'y': np.sin(x) + noise,
+})
+
+
+class TestFormula:
+
+    p = ggplot(df, aes('x', 'y')) + geom_point()
+
+    def test_lm(self):
+        p = (self.p
+             + stat_smooth(
+                 method='lm',
+                 formula='y ~ np.sin(x)',
+                 fill='red',
+                 se=True
+             ))
+        assert p == 'lm_formula'
+
+    def test_lm_weights(self):
+        p = (self.p
+             + aes(weight='x.abs()')
+             + stat_smooth(
+                 method='lm',
+                 formula='y ~ np.sin(x)',
+                 fill='red',
+                 se=True
+             ))
+        assert p == 'lm_formula_weights'
+
+    def test_glm(self):
+        p = (self.p
+             + stat_smooth(
+                 method='glm',
+                 formula='y ~ np.sin(x)',
+                 fill='red',
+                 se=True
+             ))
+        assert p == 'glm_formula'
+
+    def test_rlm(self):
+        p = (self.p
+             + stat_smooth(
+                 method='rlm',
+                 formula='y ~ np.sin(x)',
+                 fill='red',
+             ))
+        assert p == 'rlm_formula'
+
+    def test_gls(self):
+        p = (self.p
+             + stat_smooth(
+                 method='gls',
+                 formula='y ~ np.sin(x)',
+                 fill='red',
+                 se=True
+             ))
+        assert p == 'gls_formula'
diff --git a/setup.py b/setup.py
index 4c10bac6da..cce1ef945f 100644
--- a/setup.py
+++ b/setup.py
@@ -52,9 +52,9 @@ def get_required_packages():
         'matplotlib >= 3.1.1',
         'numpy >= 1.16.0',
         'scipy >= 1.2.0',
-        'patsy >= 0.4.1',
-        'statsmodels >= 0.9.0',
-        'pandas >= 0.25.0',
+        'patsy >= 0.5.1',
+        'statsmodels >= 0.11.1',
+        'pandas >= 1.0.3',
         # 'geopandas >= 0.3.0',
         'descartes >= 1.1.0'
     ]
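Since the tests above lean on patsy's handling of the ``'y ~ np.sin(x)'``
string, a small sketch of what patsy builds from such a formula may help (the
data frame is illustrative)::

    import numpy as np
    import pandas as pd
    from patsy import dmatrices

    df = pd.DataFrame({'x': np.linspace(0, 2*np.pi, 5)})
    df['y'] = np.sin(df['x'])

    # The left-hand side becomes the response, the right-hand side a
    # design matrix with an intercept plus the transformed predictor.
    y, X = dmatrices('y ~ np.sin(x)', df, return_type='dataframe')
    print(X.columns.tolist())   # ['Intercept', 'np.sin(x)']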