From e8c0f4a638b07d06f0f90d185782bf42a8cc9610 Mon Sep 17 00:00:00 2001 From: Randall Pruim Date: Thu, 10 Sep 2015 08:59:38 -0400 Subject: [PATCH 1/3] version bump --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index a0b41825..ea1b7443 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: mosaic Type: Package Title: Project MOSAIC Statistics and Mathematics Teaching Utilities -Version: 0.11 +Version: 0.11.9000 Date: 2015-09-10 Depends: R (>= 3.0.0), From 035f6bcb6494214d0c8ddd63e8dd1dfc3b7ab71d Mon Sep 17 00:00:00 2001 From: Randall Pruim Date: Thu, 10 Sep 2015 11:13:15 -0400 Subject: [PATCH 2/3] add stub of plotModel() vignette --- vignettes/plotModel-vignette.Rmd | 90 ++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 vignettes/plotModel-vignette.Rmd diff --git a/vignettes/plotModel-vignette.Rmd b/vignettes/plotModel-vignette.Rmd new file mode 100644 index 00000000..f3512b8a --- /dev/null +++ b/vignettes/plotModel-vignette.Rmd @@ -0,0 +1,90 @@ +--- +title: "Visualizing Models with plotModel()" +author: "Ben Baumer" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Vignette Title} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +require(mosaic) +``` + +For now this is just a dump of a bunch of plots. It needs to be turned into a vignette. + +## lattice + +```{r} +mod <- lm( mpg ~ factor(cyl), data = mtcars) +plotModel(mod) + +# SLR +mod <- lm( mpg ~ wt, data = mtcars) +plotModel(mod, pch = 19) + +# parallel slopes +mod <- lm( mpg ~ wt + factor(cyl), data=mtcars) +plotModel(mod) + +# multiple categorical vars +mod <- lm( mpg ~ wt + factor(cyl) + factor(vs) + factor(am), data = mtcars) +plotModel(mod) +plotModel(mod, key = ~am) + +# interaction +mod <- lm( mpg ~ wt + factor(cyl) + wt:factor(cyl), data = mtcars) +plotModel(mod) + +# polynomial terms +mod <- lm( mpg ~ wt + I(wt^2), data = mtcars) +plotModel(mod) + +# GLM +mod <- glm(vs ~ wt, data=mtcars, family = 'binomial') +plotModel(mod) + +# GLM with interaction +mod <- glm(vs ~ wt + factor(cyl), data=mtcars, family = 'binomial') +plotModel(mod) +``` + + +## ggplot2 + +```{r} +mod <- lm( mpg ~ factor(cyl), data = mtcars) +plotModel(mod, system="g") + +# SLR +mod <- lm( mpg ~ wt, data = mtcars) +plotModel(mod, system="g") + +# parallel slopes +mod <- lm( mpg ~ wt + factor(cyl), data=mtcars) +plotModel(mod, system = "g") + +# multiple categorical vars +mod <- lm( mpg ~ wt + factor(cyl) + factor(vs) + factor(am), data = mtcars) +plotModel(mod, system="g") +plotModel(mod, key = ~am, system="g") + +# interaction +mod <- lm( mpg ~ wt + factor(cyl) + wt:factor(cyl), data = mtcars) +plotModel(mod, system="g") + +# polynomial terms +mod <- lm( mpg ~ wt + I(wt^2), data = mtcars) +plotModel(mod, system="g") + +# GLM +mod <- glm(vs ~ wt, data=mtcars, family = 'binomial') +plotModel(mod, system="g") + +# GLM with interaction +mod <- glm(vs ~ wt + factor(cyl), data=mtcars, family = 'binomial') +plotModel(mod, system="g") +``` From 6e87d1a0d8319f84a12137ff4050beacd8843b62 Mon Sep 17 00:00:00 2001 From: Ben Baumer Date: Thu, 10 Sep 2015 14:36:54 -0400 Subject: [PATCH 3/3] first slog through vignette --- vignettes/plotModel-vignette.Rmd | 116 ++++++++++++++++++++++++++++++- vignettes/refs.bib | 10 +++ 2 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 vignettes/refs.bib diff --git a/vignettes/plotModel-vignette.Rmd b/vignettes/plotModel-vignette.Rmd index f3512b8a..ad79165e 100644 --- a/vignettes/plotModel-vignette.Rmd +++ b/vignettes/plotModel-vignette.Rmd @@ -3,6 +3,7 @@ title: "Visualizing Models with plotModel()" author: "Ben Baumer" date: "`r Sys.Date()`" output: rmarkdown::html_vignette +bibliography: refs.bib vignette: > %\VignetteIndexEntry{Vignette Title} %\VignetteEngine{knitr::rmarkdown} @@ -14,7 +15,118 @@ knitr::opts_chunk$set(echo = TRUE) require(mosaic) ``` -For now this is just a dump of a bunch of plots. It needs to be turned into a vignette. +#### Introduction + +Intuitively, visualizing a statistical model seems an effective way of understanding what a model **is**. For many people, a geometric presentation of a abstract concept -- such as a statistical model -- enables a conceptual grasp that remains elusive from an algebraic presentation alone. To aid in model comprehension, educators have advocated persuasively for displaying "models in the data space." [@wickham2015visualizing]. For the case of simple linear regression, the geometric presentation of the model as a line in the place that cuts through the data points in the plane is ubiquitous. But what about other kinds of models? What about multipl regression models, non-linear models, higher-dimensional models, generalized linear models, and models that contain interaction terms? Wouldn't it be helpful to visualize these models in the data space as well? + +Of course, existing packages in R provide the functionality to do all of these things. But in most of the non-trivial cases, overlaying the model and the data in the same plot requires far more advanced programming ability than introductory and intermediate statistics students are likely to have. But these students are precisely those who would benefit most from seeing their models in the data space. The `plotModel()` function in the `mosaic` package provides functionality to effortlessly visualize a variety of non-trivial statistical models on top of the data upon which the model was fit. + +#### Motivation + +Consider the following hypothetical exchange between a student in an introductory statistics class and her instructor. + +> Student: I want to build a model for $y$ in terms of $x$. How do I do that in R? + +Instructor: That's easy, just type: + +```{r, eval=FALSE} +mod <- lm(y ~ x, data=ds) +``` + +> Student: What does my model look like? + +Instructor: That's easy, it's a *line* in the $xy$-plane: + +```{r, eval=FALSE} +xyplot(y ~ x, data=ds, type=c("p", "r")) +``` + +> Student: I also want to control for the binary variable $z$. How do I add that to my model? + +Instructor: That's easy, just add it to the formula: + +```{r, eval=FALSE} +mod <- lm(y ~ x + z, data=ds) +``` + +> Student: Oh cool, what does my model look like now? + +Instructor: That's easy, it's now *two parallel lines*! + +> Student: Neat. How do I plot that in R? + +Instructor: [baffled] Um...well...it's somewhat complicated...you have to grab the right colors from the trellis palette...and then figure out the equations for the lines... + +**Instead** + +Instructor: That's easy, just type: + +```{r, eval=FALSE} +plotModel(mod) +``` + +#### Example: Regression to the Mean + +Consider Francis Galton's famous data set about the beights of adult children and their parents, made available in the `mosaicData` package. + +As a first inquiry, we might consider a model for an adult's `height` as a function of their `father`'s height. + +```{r, slr} +slr <- lm(height ~ father, data = Galton) +``` + +Since the resulting `lm` object contains all of the data on which the model was fit, the model object contains all the information we need to plot this model in the data space. + +```{r, slr-plot} +plotModel(slr) +``` + +The default underlying graphics engine is `lattice`, but if we prefer a `ggplot` graphic instead, we can specify the graphics library using the `system` argument. + +```{r, slr-plot-ggplot} +plotModel(slr, system = "g") +``` + +We know that Galton's time, in general, men tend to be taller than women. Thus it is plausible to think that the binary variable `sex` might play a role here. It is easy to add this to our model, but we can now as easily plot the model in the data space with no additional effort. + +```{r, parallel} +pslopes <- lm(height ~ father + sex, data = Galton) +plotModel(pslopes) +``` + +The geometric term for such a model -- a "parallel slopes" model -- is now obvious from the plot in a way that it was not necessarily obvious from the algebraic specfication for the model. + +We can also pass additional parameters to the underlying `panel.xyplot()` function if we want to gussy up our plot. + +```{r, parallel-dots} +plotModel(pslopes, alpha = 0.3, pch = 19, cex = 0.8) +``` + +We could of course, have one parallel line for each level of a categorical variable. + +```{r} +mslopes <- lm(height ~ father + sex + nkids, data = Galton) +plotModel(mslopes) +``` + +Or, we could combine multiple categorical variables. + +```{r} +library(magrittr) +Galton %<>% + mutate(onlyChild = ifelse(nkids == 1, TRUE, FALSE)) +mslopes <- lm(height ~ father + sex + onlyChild, data = Galton) +plotModel(mslopes) +``` + +If two explanatory variables be associated with the response in a non-linear fashion, then an interaction term could be included in our model. In this case, we might consider an interaction between the father's height and the gender of the child. + +```{r, non-parallel} +npslopes <- lm(height ~ father + sex + father:sex, data = Galton) +plotModel(npslopes) +``` + +It may be hard to see in the plot, but these lines are in fact *not* parallel. ## lattice @@ -88,3 +200,5 @@ plotModel(mod, system="g") mod <- glm(vs ~ wt + factor(cyl), data=mtcars, family = 'binomial') plotModel(mod, system="g") ``` + +## References diff --git a/vignettes/refs.bib b/vignettes/refs.bib new file mode 100644 index 00000000..361f3f07 --- /dev/null +++ b/vignettes/refs.bib @@ -0,0 +1,10 @@ +@article{wickham2015visualizing, + title={Visualizing statistical models: Removing the blindfold}, + author={Wickham, Hadley and Cook, Dianne and Hofmann, Heike}, + journal={Statistical Analysis and Data Mining: The ASA Data Science Journal}, + volume={8}, + number={4}, + pages={203--225}, + year={2015}, + publisher={Wiley Online Library} +} \ No newline at end of file