forked from behrman/ros
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgay_tv.Rmd
More file actions
206 lines (166 loc) · 4.62 KB
/
gay_tv.Rmd
File metadata and controls
206 lines (166 loc) · 4.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
---
title: "Regression and Other Stories: Gay"
author: "Andrew Gelman, Aki Vehtari"
date: "`r Sys.Date()`"
output:
github_document:
toc: true
---
Tidyverse version by Bill Behrman.
Nonlinear models (LOESS and spline) and political
attitudes as a function of age. See Chapter 22 in Regression and
Other Stories.
-------------
```{r, message=FALSE}
# Packages
library(tidyverse)
library(rstanarm)
# Parameters
# NAES survey data
file_naes <- here::here("Gay/data/naes04.csv")
# Common code
file_common <- here::here("_common.R")
#===============================================================================
# Run common code
source(file_common)
```
# 22 Advanced regression and multilevel models
## 22.7 Nonparametric regression and machine learning
### Demonstration: political attitudes as a function of age
Data
```{r, message=FALSE, warning=FALSE}
naes <-
file_naes %>%
read_csv() %>%
select(!...1)
naes
```
Let's understand the `NA`s in the data.
```{r}
v <-
naes %>%
summarize(across(everything(), ~ sum(is.na(.)) / n()))
glimpse(v)
```
The variables we wish to study have a high percentage of `NA`s. `gayFavorStateMarriage` has `r format(100 * v$gayFavorStateMarriage, digits = 1, nsmall = 1)`% `NA`s, and `gayKnowSomeone` has `r format(100 * v$gayKnowSomeone, digits = 1, nsmall = 1)`% `NA`s. This could be a problem, but we will ignore it.
Let's now look at `age`.
```{r}
age_count <- function(var) {
naes %>%
drop_na({{ var }}) %>%
count(age) %>%
arrange(desc(age))
}
age_count(gayFavorStateMarriage)
age_count(gayKnowSomeone)
```
There are very few respondents over 90, so we will combine those over 90 to an age of 91. We'll also drop those for whom `age` is `NA`.
```{r}
naes <-
naes %>%
drop_na(age) %>%
mutate(age = if_else(age <= 90, age, 91))
```
Create indicator variable `y` from `var` and add to naes.
```{r}
indicator <- function(var) {
naes %>%
drop_na({{ var }}) %>%
mutate(
y =
case_when(
{{ var }} == "Yes" ~ 1,
{{ var }} == "No" ~ 0,
TRUE ~ NA_real_
)
)
}
```
Return tibble with proportion of "Yes" responses and total number of responses for each age for variable `var`.
```{r}
yes_prop <- function(var) {
naes %>%
drop_na({{ var }}) %>%
group_by(age) %>%
summarize(
y = sum({{ var }} == "Yes") / n(),
n = n()
)
}
```
Fit model to data using loess or splines; then predict using fit over `age` range.
```{r}
pred <- function(var, method = c("loess", "splines")) {
method <- match.arg(method)
if (method == "loess") {
data <- indicator({{ var }})
fit <- loess(y ~ age, data = data)
tibble(
age = seq_range(data$age),
y = predict(fit, newdata = tibble(age))
)
} else if (method == "splines") {
data <- yes_prop({{ var }})
fit <- stan_gamm4(y ~ s(age), data = data, refresh = 0, adapt_delta = 0.99)
tibble(age = seq_range(data$age)) %>%
predictive_intervals(fit = fit)
}
}
```
Plot data and model prediction.
```{r}
plot <- function(var, method = c("", "loess", "splines")) {
method <- match.arg(method)
title <-
tibble(
gayFavorStateMarriage = "Support for same-sex marriage",
gayKnowSomeone = "Do you know any gay people?"
)
plot <-
yes_prop({{ var }}) %>%
ggplot(aes(age)) +
geom_point(aes(y = y, size = n), shape = "circle filled", fill = "grey75") +
coord_cartesian(ylim = c(0, NA)) +
scale_x_continuous(breaks = scales::breaks_width(10)) +
scale_y_continuous(
breaks = scales::breaks_width(0.1),
labels = scales::label_percent(accuracy = 1)
) +
theme(legend.position = "none") +
labs(
title = title %>% pull({{ var }}),
x = "Age",
y = "Percentage of yes responses"
)
if (method == "loess") {
plot <-
plot +
geom_line(aes(age, y), data = pred(var = {{ var }}, method = "loess")) +
labs(subtitle = "Loess fit")
} else if (method == "splines") {
v <- pred(var = {{ var }}, method = "splines")
plot <-
plot +
geom_ribbon(aes(ymin = `5%`, ymax = `95%`), data = v, alpha = 0.25) +
geom_ribbon(aes(ymin = `25%`, ymax = `75%`), data = v, alpha = 0.5) +
geom_line(aes(y = .pred), data = v) +
labs(subtitle = "Spline fit, with 50% and 90% predictive intervals")
}
plot
}
```
Plots of data.
```{r, fig.asp=0.72}
vars(gayFavorStateMarriage, gayKnowSomeone) %>%
map(plot) %>%
walk(print)
```
Plots of data and model predictions.
```{r, fig.asp=0.75}
expand_grid(
var = vars(gayFavorStateMarriage, gayKnowSomeone),
method = c("loess", "splines")
) %>%
pmap(plot) %>%
walk(print)
```