Commit fd1364b

adding scripts

1 parent e706efc commit fd1364b

1 file changed: scripts/scripts_textanalysis.qmd (+279 −0)

---
title: "Text Analysis Scripts"
format: html
editor: visual
---

## Required Packages

```{r}
#| eval: false
# Install (run once; eval is disabled so rendering does not reinstall)
install.packages("RColorBrewer") # Part I
install.packages("tidyverse")    # Parts I and II
install.packages("tidytext")     # Part II
install.packages("wordcloud")    # Part I
install.packages("wordcloud2")   # Part I
install.packages("sentimentr")   # Part III
install.packages("syuzhet")      # Part III
```

```{r}
# Load (tidyverse already attaches dplyr, ggplot2, stringr, and readr)
library(tidyverse)
library(RColorBrewer)
library(tidytext)
library(wordcloud)
library(wordcloud2)
library(sentimentr)
library(syuzhet)
```

## Part 0 - Get the Dataset

```{r}
# Load data
comments <- readr::read_csv("./data/comments_preprocessed.csv")

# Check first rows
head(comments)
```
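
The scripts below assume `comments` has an `id` column whose prefix encodes the season (starting with `s1_` or `s2_`) and a `comments` column holding the raw text. If you want to dry-run the chunks without the CSV, a minimal stand-in could look like this (the rows are invented):

```{r}
# Hypothetical stand-in (invented rows); swap in for `comments` to dry-run the chunks
comments_example <- tibble(
  id = c("s1_001", "s1_002", "s2_001"),
  comments = c("What a fantastic finale, I loved it.",
               "The pacing felt slow but the ending was great.",
               "This episode disappointed me.")
)
```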

## Part I - Word Frequencies

#### 1.1 Transform sentence strings into tokens (words) and n-grams

```{r}
# Apply tokenization (unnest_tokens also lowercases the text by default)
tokens <- comments %>%
  unnest_tokens(word, comments)

# We can also convert sentences into n-grams (sequences of consecutive words)
ngrams <- comments %>%
  unnest_tokens(ngrams, comments, token = "ngrams", n = 2) # bigrams
```
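
The raw tokens still include function words ("the", "and", ...), which will dominate the counts and clouds below. If you want to drop them, tidytext ships an English stop-word table; a minimal sketch (use `tokens_clean` in place of `tokens` downstream if you adopt it):

```{r}
# Optional: remove English stop words before counting
tokens_clean <- tokens %>%
  anti_join(stop_words, by = "word")
```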

#### 1.2 Count frequencies

```{r}
# Count word frequencies per comment
word_freq <- tokens %>%
  count(id, word, sort = TRUE)

# Count bigram frequencies per comment
bigram_freq <- ngrams %>%
  count(id, ngrams, sort = TRUE)

# Second pass: drop show-specific words that appear everywhere and carry no signal
# (tokens are lowercased, so the filter terms must be lowercase too)
word_freq_filtered <- word_freq %>%
  filter(!word %in% c("severance", "season", "appletv", "apple", "tv", "show", "finale", "episode"))
```
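
Before moving to clouds, a plain bar chart of the most frequent words is often easier to read precisely. A quick sketch over the filtered counts (the cutoff of 15 words is arbitrary):

```{r}
# Top 15 words overall (wt = n sums the per-comment counts)
word_freq_filtered %>%
  count(word, wt = n, sort = TRUE) %>%
  slice_max(n, n = 15) %>%
  ggplot(aes(x = n, y = reorder(word, n))) +
  geom_col(fill = "skyblue") +
  theme_minimal() +
  labs(title = "Most Frequent Words", x = "Count", y = NULL)
```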

#### 1.3 Word Clouds

```{r}
# Basic word cloud (all tokens, no filtering)
wordcloud_tokens <- tokens %>%
  count(word, sort = TRUE)

wordcloud2(wordcloud_tokens, size = 1.5, color = "random-light", backgroundColor = "black")
```

#### 1.4 Filtered Word Clouds

```{r}
# Only words with at least 5 mentions
# (the unwanted show-specific words were already removed in word_freq_filtered)
wordcloud_filtered <- word_freq_filtered %>%
  count(word, wt = n, sort = TRUE) %>% # wt = n sums the per-comment counts into totals
  filter(n >= 5) # example threshold

wordcloud2(wordcloud_filtered, size = 1.5, color = "random-light", backgroundColor = "black")


# Subset by seasons
# Only "s1" IDs
wordcloud_filtered_s1 <- word_freq_filtered %>%
  filter(grepl("^s1", id)) %>% # filter by id first
  count(word, wt = n, sort = TRUE) %>%
  filter(n >= 5)

wordcloud2(wordcloud_filtered_s1, size = 1.5, color = "random-light", backgroundColor = "black")


# Only "s2" IDs
wordcloud_filtered_s2 <- word_freq_filtered %>%
  filter(grepl("^s2", id)) %>% # filter by id first
  count(word, wt = n, sort = TRUE) %>%
  filter(n >= 5)

wordcloud2(wordcloud_filtered_s2, size = 1.5, color = "random-light", backgroundColor = "black")
```
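
`wordcloud2` renders an HTML widget, which only survives HTML output. The `wordcloud` package (loaded above but otherwise unused) can draw a static version; a sketch, with display parameters picked arbitrarily:

```{r}
# Static alternative using the wordcloud package
set.seed(42) # word placement is random; fix the seed for reproducibility
wordcloud(words = wordcloud_filtered$word,
          freq = wordcloud_filtered$n,
          min.freq = 5,
          colors = brewer.pal(8, "Dark2"))
```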

## Part II - TF-IDF

Note: TF-IDF per individual comment is not very informative here, since each document is one short comment. Treating each season as the document probably makes more sense; see the sketch after this chunk.

```{r}
tf_idf <- tokens %>%
  count(id, word, sort = TRUE) %>%
  bind_tf_idf(word, id, n) %>%
  arrange(desc(tf_idf)) # sort by TF-IDF weight
```
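
A sketch of the season-level variant suggested above: collapse comments into one document per season (assuming the `id` prefix encodes the season), so TF-IDF surfaces the words that distinguish s1 from s2.

```{r}
# TF-IDF with seasons as documents
tf_idf_season <- tokens %>%
  mutate(season = str_extract(id, "^s\\d+")) %>%
  count(season, word, sort = TRUE) %>%
  bind_tf_idf(word, season, n) %>%
  arrange(desc(tf_idf))

head(tf_idf_season)
```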

## Part III - Sentiment Analysis

#### Polarity with sentimentr (Valence-Shifter Capability)

```{r}
# Compute sentiment per row/case
sentiment_scores <- sentiment_by(comments$comments)

# Add scores and labels to the original dataset
polarity <- comments %>%
  mutate(score = sentiment_scores$ave_sentiment,
         sentiment_label = case_when(
           score > 0.1 ~ "positive",
           score < -0.1 ~ "negative",
           TRUE ~ "neutral"
         ))

# Check first rows with results
head(polarity)

# Counts per label
table(polarity$sentiment_label)

# Visualize
ggplot(polarity, aes(x = score)) +
  geom_histogram(binwidth = 0.1, fill = "skyblue", color = "white") +
  theme_minimal() +
  labs(title = "Sentiment Score Distribution", x = "Average Sentiment", y = "Count")

# Extract season info (s1, s2) into a new column
polarity_seasons <- mutate(polarity,
                           season = str_extract(id, "s\\d+"))

# Histogram comparison by season
ggplot(polarity_seasons, aes(x = score, fill = season)) +
  geom_histogram(binwidth = 0.1, position = "dodge", color = "white") +
  theme_minimal() +
  labs(title = "Sentiment Score Distribution by Season",
       x = "Average Sentiment", y = "Count") +
  scale_fill_brewer(palette = "Set1")

# Save results (assumes an output/ folder exists)
write_csv(polarity, "output/polarity_results.csv")
```
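
The dodged histogram is hard to compare when the seasons have different comment counts. A short per-season summary (a sketch over the columns created above) gives the headline numbers directly:

```{r}
# Mean sentiment and share of positive comments per season
polarity_seasons %>%
  group_by(season) %>%
  summarise(n_comments = n(),
            mean_score = mean(score),
            share_positive = mean(sentiment_label == "positive"))
```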

#### Emotion Detection with Syuzhet's NRC Lexicon

```{r}
# Compute NRC emotion scores per comment
# (scoring whole comments keeps one row per comment, so the scores
# line up with the original IDs when bound back below)
emotion_score <- get_nrc_sentiment(comments$comments)

# Summary output for each emotion
summary(emotion_score)

# Attach the scores to the original rows
emotion_data <- bind_cols(comments, emotion_score)

# Summarize counts for each emotion
emotion_summary <- emotion_data %>%
  select(anger:trust) %>% # only the eight emotion columns
  summarise(across(everything(), sum)) %>%
  pivot_longer(cols = everything(), names_to = "emotion", values_to = "count") %>%
  arrange(desc(count)) # sort by count

# Plot overall emotion distribution
ggplot(emotion_summary, aes(x = emotion, y = count, fill = emotion)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = count),
            hjust = -0.2, # position slightly outside the bar
            size = 2) +   # font size
  scale_fill_manual(values = brewer.pal(8, "Paired")) + # one color per emotion
  theme_minimal(base_size = 12) +
  labs(title = "Overall Emotion Distribution",
       x = "Emotion",
       y = "Total Count") +
  coord_flip()

# Summary by season
# Create a season column from the id prefix
emotion_seasons <- emotion_data %>%
  mutate(season = ifelse(grepl("^s1_", id), "s1",
                         ifelse(grepl("^s2_", id), "s2", NA)))

# Summarize emotions (plus negative/positive) per season
emotion_by_season <- emotion_seasons %>%
  group_by(season) %>%
  summarise(across(anger:positive, \(x) sum(x, na.rm = TRUE)))

emotion_by_season

# Convert to long format for plotting
emotion_long <- emotion_by_season %>%
  pivot_longer(
    cols = anger:positive,
    names_to = "emotion",
    values_to = "count"
  )

# Plot horizontal bars to compare seasons
ggplot(emotion_long,
       aes(x = reorder(emotion, -count), y = count, fill = season)) +
  geom_col(position = "dodge") +
  geom_text(aes(label = count),
            hjust = -0.2, # position slightly outside the bar
            size = 2) +   # font size
  scale_fill_brewer(palette = "Set2") +
  theme_minimal(base_size = 12) +
  labs(
    title = "Emotion Distribution by Season",
    x = "Emotion",
    y = "Total Count",
    fill = "Season"
  ) +
  coord_flip()

# Co-occurrence heatmap
emotion_matrix <- emotion_data %>%
  select(anger:trust) # keep only emotion columns; drop positive and negative

# Compute pairwise correlations between emotions
co_occurrence <- cor(emotion_matrix, method = "pearson") # values from -1 to 1

# Remove the diagonal (always 1) so it does not dominate the color scale
diag(co_occurrence) <- NA

# Convert to long format for ggplot
co_occurrence_long <- as.data.frame(as.table(co_occurrence))
colnames(co_occurrence_long) <- c("emotion1", "emotion2", "correlation")

# Plot heatmap (scale runs 0..1; negative correlations, if any, render grey)
ggplot(co_occurrence_long, aes(x = emotion1, y = emotion2, fill = correlation)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(mid = "white", high = "red", midpoint = 0,
                       limits = c(0, 1), na.value = "grey95", name = "Correlation") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Emotion Co-occurrence Heatmap",
       x = "Emotion",
       y = "Emotion")

# Save results (assumes an output/ folder exists)
write_csv(emotion_data, "output/sentiment_emotion_results.csv")
```
