-
Notifications
You must be signed in to change notification settings - Fork 5
Open
Description
ANEW, for instance, has three keys (pleasure, arousal, and dominance). If the dictionary is not subset to a single key before it is passed to textstat_valence(), the valences of all three keys are averaged together, which is probably not what a user wants.
library("quanteda")
#> Package version: 3.2.5
#> Unicode version: 14.0
#> ICU version: 70.1
#> Parallel computing: 10 of 10 threads used.
#> See https://quanteda.io for tutorials and examples.
library("quanteda.sentiment")
#>
#> Attaching package: 'quanteda.sentiment'
#> The following object is masked from 'package:quanteda':
#>
#> data_dictionary_LSD2015
# Tokenise the documents returned by tail() on the inaugural corpus.
toks <- tokens(tail(data_corpus_inaugural))

# Baseline: score with the whole multi-key dictionary, no key selected.
tstat1 <- textstat_valence(toks, dictionary = data_dictionary_ANEW)
# Rename by column name rather than by position so the intent is explicit.
names(tstat1)[names(tstat1) == "sentiment"] <- "sentiment_entire"

# (virtually) same as: scoring against one synthetic key, "all", whose
# valence is the row-wise mean of the dictionary's valence vectors
# (assumes the per-key vectors are aligned word by word -- TODO confirm).
# NOTE: these assignments create a modified copy of data_dictionary_ANEW in
# the workspace that masks the package dataset until it is removed.
data_dictionary_ANEW["all"] <- data_dictionary_ANEW["arousal"]
valence(data_dictionary_ANEW)["all"] <- list(
  all = rowMeans(as.data.frame(valence(data_dictionary_ANEW)))
)
tstat2 <- textstat_valence(toks, dictionary = data_dictionary_ANEW["all"])

# Put both scores side by side, one row per document.
df <- cbind(tstat1, sentiment_averaged = tstat2[["sentiment"]])
df
#> doc_id sentiment_entire sentiment_averaged
#> 1 2001-Bush 5.614141 5.616636
#> 2 2005-Bush 5.757551 5.758452
#> 3 2009-Obama 5.460327 5.459256
#> 4 2013-Obama 5.606507 5.606507
#> 5 2017-Trump 5.701053 5.701393
#> 6 2021-Biden 5.629749 5.630278
# Pearson correlation test between the whole-dictionary scores and the
# scores obtained from the averaged "all" key.
cor.test(df$sentiment_entire, df$sentiment_averaged)
#>
#> Pearson's product-moment correlation
#>
#> data: df$sentiment_entire and df$sentiment_averaged
#> t = 197.75, df = 4, p-value = 3.923e-09
#> alternative hypothesis: true correlation is not equal to 0
#> 95 percent confidence interval:
#> 0.9995085 0.9999947
#> sample estimates:
#> cor
#> 0.9999489
# different from (e.g.) scoring against a single key, here "pleasure":
textstat_valence(toks, dictionary = data_dictionary_ANEW["pleasure"])
#> doc_id sentiment
#> 1 2001-Bush 6.091330
#> 2 2005-Bush 6.308839
#> 3 2009-Obama 5.841437
#> 4 2013-Obama 6.045129
#> 5 2017-Trump 6.223944
#> 6 2021-Biden 6.018528
# what we probably want: one sentiment column per dictionary key
# Remove the modified workspace copy (the one carrying the extra "all" key)
# so that the name resolves to the original dictionary again.
rm(data_dictionary_ANEW)

# Score the tokens once per key. Values and column names are both derived
# from the same key vector, so they cannot get out of step.
keys <- names(data_dictionary_ANEW)
valence_by_key <- lapply(keys, function(key) {
  textstat_valence(toks, dictionary = data_dictionary_ANEW[key])
})
sentiment_cols <- setNames(
  lapply(valence_by_key, `[[`, "sentiment"),
  paste("sentiment", keys, sep = "_")
)
# doc_id is taken from the first result; every call scores the same tokens.
data.frame(doc_id = valence_by_key[[1L]][["doc_id"]], sentiment_cols)
#> doc_id sentiment_pleasure sentiment_arousal sentiment_dominance
#> 1 2001-Bush 6.091330 5.252523 5.506055
#> 2 2005-Bush 6.308839 5.315839 5.650677
#> 3 2009-Obama 5.841437 5.207523 5.328807
#> 4 2013-Obama 6.045129 5.342878 5.431513
#> 5 2017-Trump 6.223944 5.334413 5.545822
#> 6 2021-Biden 6.018528 5.327389 5.544917

Created on 2023-02-13 with reprex v2.0.2
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels