|
| 1 | +--- |
| 2 | +title: "Text Analysis Scripts" |
| 3 | +format: html |
| 4 | +editor: visual |
| 5 | +--- |
| 6 | + |
## Required Packages
| 8 | + |
```{r}
# One-time setup: install all required packages in a single vectorized call
# (lets R resolve shared dependencies once instead of seven times).
# Run manually as needed; consider `#| eval: false` so rendering skips it.
install.packages(c(
  "RColorBrewer", # Part I
  "tidyverse",    # Parts I and II
  "tidytext",     # Part II
  "wordcloud",    # Part I
  "wordcloud2",   # Part I
  "sentimentr",   # Part III
  "syuzhet"       # Part III
))
```
| 19 | + |
```{r}
# Attach packages for all three parts.
# NOTE(review): dplyr, ggplot2, and stringr are also attached by
# library(tidyverse), so those three explicit calls are redundant
# (harmless, but could be dropped).
library(dplyr)        # data manipulation verbs
library(ggplot2)      # plots (Parts I and III)
library(RColorBrewer) # color palettes (brewer.pal, scale_fill_brewer)
library(stringr)      # str_extract for season ids (Part III)
library(tidytext)     # unnest_tokens, bind_tf_idf (Parts I and II)
library(tidyverse)    # readr, tidyr (read_csv, pivot_longer), etc.
library(wordcloud)    # classic word clouds (Part I)
library(wordcloud2)   # HTML word clouds (Part I)
library(sentimentr)   # valence-shifter-aware polarity (Part III)
library(syuzhet)      # NRC emotion lexicon (Part III)
```
| 33 | + |
| 34 | +## Part 0 - Get the Dataset |
| 35 | + |
```{r}
# Load Data: pre-processed comments.
# NOTE(review): path is relative to this .qmd file's location; downstream
# chunks rely on columns `id` (with "s1"/"s2" prefixes) and `comments`
# (the text) -- confirm against the CSV.
comments <- readr::read_csv("./data/comments_preprocessed.csv")

# Check first rows
head(comments)
```
| 43 | + |
| 44 | +## Part I - Word Frequencies |
| 45 | + |
| 46 | +#### 1.1 Transform sentence strings into tokens (words) and n-grams |
| 47 | + |
```{r}
# Tokenize: one row per word, taken from the `comments` text column.
# unnest_tokens lowercases and strips punctuation by default.
tokens <- unnest_tokens(comments, word, comments)

# Bigrams: one row per pair of consecutive words from the same column.
ngrams <- unnest_tokens(comments, ngrams, comments, token = "ngrams", n = 2)
```
| 58 | + |
| 59 | +#### 1.2 Count frequencies |
| 60 | + |
```{r}
# Word frequencies per comment id
word_freq <- tokens %>%
  count(id, word, sort = TRUE)

# Bigram frequencies per comment id
bigram_freq <- ngrams %>%
  count(id, ngrams, sort = TRUE)

# Second pass: drop show-specific terms that dominate the counts
# without carrying meaning.
unwanted <- c("severance", "season", "appleTV", "apple",
              "tv", "show", "finale", "episode")

word_freq_filtered <- word_freq %>%
  filter(!(word %in% unwanted))
```
| 76 | + |
| 77 | +#### 1.3 Word Clouds |
| 78 | + |
```{r}
# Basic word cloud over all tokens (no filtering applied).
# wordcloud2 reads the first two columns as word/frequency (here: word, n).
wordcloud_tokens <- count(tokens, word, sort = TRUE)

wordcloud2(
  wordcloud_tokens,
  size = 1.5,
  color = "random-light",
  backgroundColor = "black"
)
```
| 87 | + |
| 88 | +#### 1.4 Filtered Wordclouds |
| 89 | + |
```{r}
# Helper: total mentions per word, keeping words with at least `min_n`
# mentions, optionally restricted to ids matching `id_pattern`.
# Fix vs. original: `count(word)` on per-id counts tallied the number of
# ids containing a word, not total mentions as the "at least 5 mentions"
# comment intended -- `wt = n` sums the per-id frequencies instead.
# Also removes the redundant second unwanted-word filter (the input
# word_freq_filtered is already filtered) and the triplicated pipeline.
cloud_counts <- function(freq, min_n = 5, id_pattern = NULL) {
  if (!is.null(id_pattern)) {
    freq <- filter(freq, grepl(id_pattern, id)) # subset by id first
  }
  freq %>%
    count(word, wt = n, sort = TRUE) %>%
    filter(n >= min_n) # example threshold
}

# All seasons: words with at least 5 mentions
wordcloud_filtered <- cloud_counts(word_freq_filtered)
wordcloud2(wordcloud_filtered, size = 1.5, color = "random-light", backgroundColor = "black")

# Subset by seasons
# Only "s1" IDs
wordcloud_filtered_s1 <- cloud_counts(word_freq_filtered, id_pattern = "^s1")
wordcloud2(wordcloud_filtered_s1, size = 1.5, color = "random-light", backgroundColor = "black")

# Only "s2" IDs
wordcloud_filtered_s2 <- cloud_counts(word_freq_filtered, id_pattern = "^s2")
wordcloud2(wordcloud_filtered_s2, size = 1.5, color = "random-light", backgroundColor = "black")
```
| 120 | + |
| 121 | +## Part II - TF-IDF |
| 122 | + |
Note: computing TF-IDF with each individual comment as a "document" is not very informative here (most words appear in only one short comment). Grouping documents by season may give more meaningful weights.
| 124 | + |
```{r}
# TF-IDF weights, treating each comment id as one "document".
word_counts <- count(tokens, id, word, sort = TRUE)

tf_idf <- word_counts %>%
  bind_tf_idf(term = word, document = id, n = n) %>%
  arrange(desc(tf_idf)) # highest TF-IDF weight first
```
| 131 | + |
| 132 | +## Part III - Sentiment Analysis |
| 133 | + |
#### Polarity with sentimentr (valence-shifter capability)
| 135 | + |
```{r}
# Polarity per row with sentimentr, which accounts for valence shifters
# (negators, amplifiers) rather than scoring words in isolation.
sentiment_scores <- sentiment_by(comments$comments)

# Attach the averaged score and a 3-way label to the original dataset.
# The +/- 0.1 thresholds define the "neutral" band.
polarity <- comments %>%
  mutate(score = sentiment_scores$ave_sentiment,
         sentiment_label = case_when(
           score > 0.1 ~ "positive",
           score < -0.1 ~ "negative",
           TRUE ~ "neutral"
         ))

# Check first rows with results
head(polarity)

# Counts per label
table(polarity$sentiment_label)

# Visualize the overall score distribution
ggplot(polarity, aes(x = score)) +
  geom_histogram(binwidth = 0.1, fill = "skyblue", color = "white") +
  theme_minimal() +
  labs(title = "Sentiment Score Distribution", x = "Average Sentiment", y = "Count")

# Extract season info (s1, s2) from the id into a new column
polarity_seasons <- mutate(polarity,
                           season = str_extract(id, "s\\d+"))

# Histogram comparison by season
ggplot(polarity_seasons, aes(x = score, fill = season)) +
  geom_histogram(binwidth = 0.1, position = "dodge", color = "white") +
  theme_minimal() +
  labs(title = "Sentiment Score Distribution by Season",
       x = "Average Sentiment", y = "Count") +
  scale_fill_brewer(palette = "Set1")

# Save results.
# Fix: write_csv errors if the target directory is missing, so create
# "output/" on first run.
if (!dir.exists("output")) dir.create("output", recursive = TRUE)
write_csv(polarity, "output/polarity_results.csv")
```
| 177 | + |
| 178 | +#### Emotion Detection with Syuzhet's NRC Lexicon |
| 179 | + |
```{r}
# Split comments into sentences for sentence-level NRC scoring
sentences <- get_sentences(comments$comments)

# NRC lexicon counts per sentence: 8 emotions plus positive/negative
emotion_score <- get_nrc_sentiment(sentences)

# Summary output for each emotion column
summary(emotion_score)

# Attach sentence text back so ids line up with the scores.
# NOTE(review): this overwrites the `comments` data frame in place (rerunning
# earlier chunks afterwards would see modified data), and bind_cols assumes
# get_sentences() returned exactly one element per original row -- a comment
# split into several sentences would break the alignment. Confirm.
comments$comments <- sentences
emotion_data <- bind_cols(comments, emotion_score)

# Total count for each emotion across all sentences, long format, sorted
emotion_summary <- emotion_data %>%
  select(anger:trust) %>% # only emotion columns
  summarise(across(everything(), sum)) %>%
  pivot_longer(cols = everything(), names_to = "emotion", values_to = "count") %>%
  arrange(desc(count)) # sort by count

# Plot overall emotion distribution (horizontal bars via coord_flip)
ggplot(emotion_summary, aes(x = emotion, y = count, fill = emotion)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = count),
            hjust = -0.2, # position slightly outside the bar
            size = 2) + # font size
  scale_fill_manual(values = brewer.pal(10, "Paired")) + # we need more than 8 colors
  theme_minimal(base_size = 12) +
  labs(title = "Overall Emotion Distribution",
       x = "Emotion",
       y = "Total Count") +
  coord_flip()

# Summary by season
# Create season column from the id prefix ("s1_..." / "s2_...")
emotion_seasons <- emotion_data %>%
  mutate(season = ifelse(grepl("^s1_", id), "s1",
                  ifelse(grepl("^s2_", id), "s2", NA)))

# Summarize emotions per season.
# NOTE(review): passing `sum, na.rm = TRUE` through across() is deprecated in
# dplyr >= 1.1; current idiom is across(anger:positive, \(x) sum(x, na.rm = TRUE)).
emotion_by_season <- emotion_seasons %>%
  group_by(season) %>%
  summarise(across(anger:positive, sum, na.rm = TRUE))

emotion_by_season

# Convert to long format for plotting
emotion_long <- emotion_by_season %>%
  pivot_longer(
    cols = anger:positive,
    names_to = "emotion",
    values_to = "count"
  )

# Plot horizontal bars to compare seasons
ggplot(emotion_long,
       aes(x = reorder(emotion, -count), y = count, fill = season)) +
  geom_col(position = "dodge") +
  geom_text(aes(label = count),
            hjust = -0.2, # position slightly outside the bar
            size = 2) + # font size
  scale_fill_brewer(palette = "Set2") +
  theme_minimal(base_size = 12) +
  labs(
    title = "Emotion Distribution by Season",
    x = "Emotion",
    y = "Total Count",
    fill = "Season"
  ) +
  coord_flip()

# Co-occurrence heatmap
emotion_matrix <- emotion_data %>%
  select(anger:trust) # keeping only emotion columns; removing positive and negative

# Pairwise Pearson correlations between emotion counts (values from -1 to 1)
co_occurrence <- cor(emotion_matrix, method = "pearson")

# Blank out the trivial diagonal (self-correlation is always 1)
diag(co_occurrence) <- NA

# Convert to long format for ggplot
co_occurrence_long <- as.data.frame(as.table(co_occurrence))
colnames(co_occurrence_long) <- c("emotion1", "emotion2", "correlation")

# Plot heatmap.
# NOTE(review): limits = c(0, 1) renders any negative correlation with the
# na.value grey -- widen to c(-1, 1) if negatives should be visible.
ggplot(co_occurrence_long, aes(x = emotion1, y = emotion2, fill = correlation)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(mid = "white", high = "red", midpoint = 0,
                       limits = c(0,1), na.value = "grey95", name = "Correlation") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Emotion Co-occurrence Heatmap",
       x = "Emotion",
       y = "Emotion")

# Save results
# NOTE(review): assumes the "output/" directory already exists -- write_csv
# will error otherwise.
write_csv(emotion_data, "output/sentiment_emotion_results.csv")
```
0 commit comments