|
| 1 | +--- |
| 2 | +title: "Practicing categorical variables" |
| 3 | +author: "Angela Zoss" |
| 4 | +date: "3/5/2023" |
| 5 | +output: html_document |
| 6 | +--- |
| 7 | + |
| 8 | +## Load packages |
| 9 | + |
| 10 | +```{r} |
| 11 | +
|
| 12 | +library(tidyverse) |
| 13 | +library(colorspace) |
| 14 | +library(readxl) |
| 15 | +library(here) |
| 16 | +here::i_am("Day 2/3-categorical-variables-final.Rmd") |
| 17 | +
|
| 18 | +``` |
| 19 | + |
| 20 | +## Load your data |
| 21 | + |
| 22 | +```{r} |
| 23 | +
|
| 24 | +# We're going to load this dataset in a different way this time. We're going to |
| 25 | +# leave some of the factors out of it so we can practice making them ourselves. |
| 26 | +
|
| 27 | +inclusiveness_index_unfactored <- |
| 28 | + read_excel(here("data", "inclusiveness_index", "global_data_for_website_2020.xlsx"), |
| 29 | + na = "9999") %>% # cells containing 9999 are read as NA |
| 30 | + rename_with(~ str_remove_all(.x, "[()]") %>% str_replace_all("[- ]", ".")) %>% # clean column names: drop parentheses, then hyphens and spaces become dots |
| 31 | + dplyr::filter(Continent != "Antarctica") # filter() also drops rows where Continent is NA |
| 32 | +
|
| 33 | +``` |
| 34 | + |
| 35 | +## Add factors based on data meaning (natural ordering) |
| 36 | + |
| 37 | +```{r} |
| 38 | +
|
| 39 | +# The Index.categories.2020 variable is a categorical variable with a natural |
| 40 | +# ordering: High to Low (or High to No data). |
| 41 | +
|
| 42 | +# See what a plot looks like when we just use the data without creating a factor. |
| 43 | +
|
| 44 | +inclusiveness_index_unfactored %>% |
| 45 | + ggplot(aes(x = Continent, fill = Index.categories.2020)) + |
| 46 | + geom_bar() + |
| 47 | + scale_fill_discrete_sequential(palette = "Purple-Blue") |
| 48 | +
|
| 49 | +``` |
| 50 | + |
| 51 | +```{r} |
| 52 | +# To ensure the variable is always displayed with the correct order, first determine |
| 53 | +# the order by looking at the possible values. Then create a vector with the |
| 54 | +# correct order and use that vector to set new levels for the factor. |
| 55 | +
|
| 56 | +inclusiveness_index_unfactored %>% pull(Index.categories.2020) %>% unique() |
| 57 | +
|
| 58 | +index_cat_levels <- c("No data", "Low", "Medium-Low", "Medium", "Medium-High", "High") |
| 59 | +
|
| 60 | +inclusiveness_index_factored_natural <- mutate( |
| 61 | + inclusiveness_index_unfactored, |
| 62 | + Index.categories.2020 = fct_relevel(as_factor(Index.categories.2020), index_cat_levels)) |
| 63 | +
|
| 64 | +``` |
| 65 | + |
| 66 | +```{r} |
| 67 | +
|
| 68 | +# And here is exactly the same chart again, just using the modified dataset. |
| 69 | +
|
| 70 | +inclusiveness_index_factored_natural %>% |
| 71 | + ggplot(aes(x = Continent, fill = Index.categories.2020)) + |
| 72 | + geom_bar() + |
| 73 | + scale_fill_discrete_sequential(palette = "Purple-Blue") |
| 74 | +
|
| 75 | +
|
| 76 | +``` |
| 77 | + |
| 78 | +## Add factors based on data patterns |
| 79 | + |
| 80 | +```{r} |
| 81 | +
|
| 82 | +# Let's look again at this bar chart showing average inclusiveness index by |
| 83 | +# continent. What do you think about the order of the bars? |
| 84 | +
|
| 85 | +inclusiveness_index_factored_natural %>% |
| 86 | + dplyr::filter(!is.na(Inclusiveness.index.2020)) %>% |
| 87 | + group_by(Continent) %>% |
| 88 | + summarise(avg_inclusiveness = mean(Inclusiveness.index.2020)) %>% |
| 89 | + ggplot(aes(x = Continent, y = avg_inclusiveness)) + |
| 90 | + geom_col() |
| 91 | +
|
| 92 | +``` |
| 93 | + |
| 94 | +```{r} |
| 95 | +
|
| 96 | +# For a chart like this, ordering the bars by highest to lowest can make it easier |
| 97 | +# to see the patterns in the data. For this kind of factor, though, we don't |
| 98 | +# necessarily want to make a permanent change to the dataset. The order of the |
| 99 | +# continents that makes sense for this chart may not make sense for other views |
| 100 | +# of the data. |
| 101 | +
|
| 102 | +# In cases like these, we can create a factor as part of the data processing |
| 103 | +# and then pipe it right into the chart, instead of storing that factor as |
| 104 | +# part of the data frame. |
| 105 | +
|
| 106 | +inclusiveness_index_factored_natural %>% |
| 107 | + dplyr::filter(!is.na(Inclusiveness.index.2020)) %>% |
| 108 | + group_by(Continent) %>% |
| 109 | + summarise(avg_inclusiveness = mean(Inclusiveness.index.2020)) %>% |
| 110 | + mutate(Continent = fct_reorder(as_factor(Continent), avg_inclusiveness)) %>% |
| 111 | + ggplot(aes(x = Continent, y = avg_inclusiveness)) + |
| 112 | + geom_col() |
| 113 | +
|
| 114 | +# or in reverse order |
| 115 | +
|
| 116 | +inclusiveness_index_factored_natural %>% |
| 117 | + drop_na(Inclusiveness.index.2020) %>% # rows without an index value are left out of the mean |
| 118 | + group_by(Continent) %>% |
| 119 | + summarise(avg_inclusiveness = mean(Inclusiveness.index.2020)) %>% |
| 120 | + mutate(Continent = Continent %>% as_factor() %>% fct_reorder(avg_inclusiveness) %>% fct_rev()) %>% # fct_rev() flips the level order produced by fct_reorder() |
| 121 | + ggplot(mapping = aes(x = Continent, y = avg_inclusiveness)) + |
| 122 | + geom_col() |
| 123 | +
|
| 124 | +``` |
| 125 | + |
| 126 | +## Rotating a plot with long names |
| 127 | + |
| 128 | +```{r} |
| 129 | +
|
| 130 | +# This plot shows the inclusiveness index for European countries, |
| 131 | +# without the null values. But you'll have to take my word for it! |
| 132 | +
|
| 133 | +inclusiveness_index_factored_natural %>% |
| 134 | + dplyr::filter(Continent == "Europe") %>% |
| 135 | + dplyr::filter(!is.na(Inclusiveness.index.2020)) %>% |
| 136 | + ggplot(aes(x = Country, y = Inclusiveness.index.2020)) + |
| 137 | + geom_col() |
| 138 | +
|
| 139 | +``` |
| 140 | + |
| 141 | +```{r} |
| 142 | +
|
| 143 | +# Let's go ahead and flip the axes so we can see the labels. |
| 144 | +
|
| 145 | +inclusiveness_index_factored_natural %>% |
| 146 | + dplyr::filter(Continent == "Europe") %>% |
| 147 | + dplyr::filter(!is.na(Inclusiveness.index.2020)) %>% |
| 148 | + ggplot(aes(x = Inclusiveness.index.2020, y = Country)) + |
| 149 | + geom_col() |
| 150 | +
|
| 151 | +``` |
| 152 | + |
| 153 | + |
| 154 | +```{r} |
| 155 | +
|
| 156 | +# Now we can also sort the bars based on the inclusiveness index by |
| 157 | +# creating a factor on Country. Remember, the axes "start" from the |
| 158 | +# bottom left-hand corner, so what would normally be on the left on the |
| 159 | +# x axis will be on the bottom on the y axis. |
| 160 | +
|
| 161 | +
|
| 162 | +inclusiveness_index_factored_natural %>% |
| 163 | + dplyr::filter(Continent == "Europe") %>% |
| 164 | + dplyr::filter(!is.na(Inclusiveness.index.2020)) %>% |
| 165 | + mutate(Country = fct_reorder(as_factor(Country), Inclusiveness.index.2020)) %>% |
| 166 | + ggplot(aes(x = Inclusiveness.index.2020, y = Country)) + |
| 167 | + geom_col() |
| 168 | +
|
| 169 | +``` |
| 170 | + |
| 171 | +## Additional charts for categorical data |
| 172 | + |
| 173 | +### Pie Chart |
| 174 | + |
| 175 | +```{r} |
| 176 | +
|
| 177 | +# A pie chart works a lot like a bar chart. It takes one categorical variable |
| 178 | +# and one number. But ggplot2 struggles a bit to visualize pie charts because |
| 179 | +# they don't operate on a normal x-y coordinate plane. |
| 180 | +
|
| 181 | +# The first stage of a pie chart is actually a stacked bar chart with one bar. |
| 182 | +# Here we've used the y aesthetic, so the bar is horizontal. We don't actually |
| 183 | +# have to specify a variable for y, though, because we don't want multiple bars |
| 184 | +# along the y axis. We just want one. So we can give it a single value, like |
| 185 | +# "". |
| 186 | +
|
| 187 | +# Then we fill the bar with the variable we want to use for our different |
| 188 | +# pie wedges. Here we're using index categories. |
| 189 | +
|
| 190 | +ggplot(data = inclusiveness_index_factored_natural, |
| 191 | + mapping = aes(y = "", fill = Index.categories.2020)) + |
| 192 | + geom_bar(position = "fill", color = "#cccccc") + |
| 193 | + scale_fill_discrete_sequential(palette = "Purple-Blue") |
| 194 | +
|
| 195 | +
|
| 196 | +``` |
| 197 | + |
| 198 | +```{r} |
| 199 | +
|
| 200 | +# The final step in the pie chart is to change the coordinate system. The default |
| 201 | +# coordinate system for a bar chart is Cartesian. We can use a coord_polar() layer |
| 202 | +# to change this to a polar coordinate system, which starts at the "12:00" position |
| 203 | +# and rotates one way or the other. We specify "direction = -1" to have it rotate |
| 204 | +# counter-clockwise, putting the dark blue color on the left and the gray color |
| 205 | +# on the right. This is just personal preference - I like the largest wedge on the |
| 206 | +# right. |
| 207 | +
|
| 208 | +# At this point, you can adjust the theme of the plot to get it to look better, but |
| 209 | +# ggplot2 doesn't have great support for making pie charts look nice. |
| 210 | +
|
| 211 | +ggplot(data = inclusiveness_index_factored_natural, |
| 212 | + mapping = aes(y = "", fill = Index.categories.2020)) + |
| 213 | + geom_bar(position = "fill", color = "#cccccc") + |
| 214 | + coord_polar(direction = -1) + |
| 215 | + scale_fill_discrete_sequential(palette = "Purple-Blue") |
| 216 | +
|
| 217 | +``` |
| 218 | + |
| 219 | + |
| 220 | +### Heat Map |
| 221 | + |
| 222 | +```{r} |
| 223 | +
|
| 224 | +# When you have two categorical variables you want to show in the same plot, |
| 225 | +# you can add color to a bar chart, but that's not always the most effective |
| 226 | +# way to explore the data. For example, stacked bars make it easy to compare |
| 227 | +# the bar segments on the bottom and the overall heights of the bars, but the |
| 228 | +# values in the middle are very difficult to compare. |
| 229 | +
|
| 230 | +# One alternative to stacked bars is a heat map, or a table where you place |
| 231 | +# each categorical variable on one axis, and then the cell at the intersection |
| 232 | +# of those categories is colored to show a numerical value. Take the example |
| 233 | +# below, where we are splitting the data up into continents and index categories |
| 234 | +# and then using color to show how many data points there are in each bin. |
| 235 | +
|
| 236 | +inclusiveness_index_factored_natural %>% |
| 237 | + count(Continent, Index.categories.2020, name = "n") %>% |
| 238 | + ggplot(aes(x = Continent, y = Index.categories.2020)) + |
| 239 | + geom_tile(aes(fill = n)) |
| 240 | +
|
| 241 | +``` |
| 242 | + |
| 243 | +```{r} |
| 244 | +
|
| 245 | +# Another reason heat maps may be more effective is if the data you are trying |
| 246 | +# to show includes both positive and negative numbers, or numbers that shouldn't |
| 247 | +# be added together. Stacked bar charts strongly suggest that the sum of the |
| 248 | +# numbers in the bar is meaningful, that it's important to know the total. That |
| 249 | +# isn't true for the dataset below, where we have calculated the average |
| 250 | +# inclusiveness index and other sub-indices for the various continents. Not only |
| 251 | +# are there negative values in this dataset, but it is not meaningful to add |
| 252 | +# up the averages of the different indices. |
| 253 | +
|
| 254 | +# This heat map breaks down the data so we can see on average how the countries |
| 255 | +# in each continent compare on the different sub-indices, as well as the overall |
| 256 | +# inclusiveness index. |
| 257 | +
|
| 258 | +inclusiveness_index_factored_natural %>% |
| 259 | + pivot_longer(cols = c(Inclusiveness.index.2020, starts_with("Z")), |
| 260 | + names_to = "index_name", |
| 261 | + values_to = "index_values", |
| 262 | + values_drop_na = TRUE) %>% |
| 263 | + group_by(Continent, index_name) %>% |
| 264 | + summarise(mean_incl_ind = mean(index_values), .groups = "drop") %>% |
| 265 | + mutate( |
| 266 | + index_name = fct_reorder( |
| 267 | + as_factor(index_name), mean_incl_ind), |
| 268 | + Continent = fct_rev( |
| 269 | + fct_reorder( |
| 270 | + as_factor(Continent), mean_incl_ind)) |
| 271 | + ) %>% |
| 272 | + ggplot(aes(x = Continent, y = index_name, fill = mean_incl_ind)) + |
| 273 | + geom_tile() + |
| 274 | + scale_fill_distiller(type = "div", palette = 3, |
| 275 | + limits = c(-0.8, 0.8), |
| 276 | + direction = -1) |
| 277 | +
|
| 278 | +``` |
| 279 | + |
| 280 | +### Dumbbell plot |
| 281 | + |
| 282 | +```{r} |
| 283 | +
|
| 284 | +# Another limitation of bar charts is when you want to make direct numerical |
| 285 | +# comparisons across series. The heatmap can show two continents next to each |
| 286 | +# other, but it's hard to directly compare shades of colors to see data patterns. |
| 287 | +# On the other hand, even side by side bar charts can make it hard to see these |
| 288 | +# patterns. |
| 289 | +
|
| 290 | +# Imagine we want to compare Asia and Europe in terms of the number of |
| 291 | +# countries in each index category. |
| 292 | +
|
| 293 | +# The first plot groups all of the segments by continent, which makes it easy |
| 294 | +# to compare different Index categories within a single continent. This plot |
| 295 | +# shows a version of a bar chart we haven't talked about - a grouped bar chart, |
| 296 | +# which you create in ggplot2 with "position = position_dodge()". |
| 297 | +
|
| 298 | +
|
| 299 | +inclusiveness_index_factored_natural %>% |
| 300 | + dplyr::filter(Continent %in% c("Asia", "Europe")) %>% |
| 301 | + ggplot( |
| 302 | + mapping = aes(x = Continent, fill = Index.categories.2020)) + |
| 303 | + geom_bar(position = "dodge") + |
| 304 | + scale_fill_discrete_sequential(palette = "Purple-Blue") |
| 305 | +
|
| 306 | +``` |
| 307 | + |
| 308 | +```{r} |
| 309 | +# What if we want to bring more attention to the difference between continents for |
| 310 | +# each category? We could always switch which category is the primary division on |
| 311 | +# the x axis and which is represented by color. |
| 312 | +
|
| 313 | +inclusiveness_index_factored_natural %>% |
| 314 | + dplyr::filter(Continent %in% c("Asia", "Europe")) %>% |
| 315 | + ggplot( |
| 316 | + mapping = aes(x = Index.categories.2020, fill = Continent)) + |
| 317 | + geom_bar(position = "dodge") |
| 318 | +
|
| 319 | +``` |
| 320 | + |
| 321 | +```{r} |
| 322 | +# This improves our ability to compare the continents directly because the bars |
| 323 | +# are directly next to each other. The amount of space the bars take up is still |
| 324 | +# pretty large, though. If we combine this chart with something like a scatter plot, |
| 325 | +# we get one last variation: a dumbbell plot. |
| 326 | +
|
| 327 | +# With a dumbbell plot, we use a circle to represent the data values, just like |
| 328 | +# the lollipop. Instead of having a line that extends all the way to the axis, |
| 329 | +# though, we use a line to connect the two dots in each category of Inclusiveness |
| 330 | +# Index. |
| 331 | +
|
| 332 | +inclusiveness_index_factored_natural %>% |
| 333 | + dplyr::filter(Continent %in% c("Asia", "Europe")) %>% |
| 334 | + count(Continent, Index.categories.2020) %>% |
| 335 | + ggplot( |
| 336 | + mapping = aes(x = Index.categories.2020, y = n)) + |
| 337 | + geom_line(aes(group = Index.categories.2020), color = "#555555", linewidth = 1) + |
| 338 | + geom_point(aes(color = Continent), size = 6) |
| 339 | +
|
| 340 | +``` |
| 341 | + |
0 commit comments