Skip to content

Consider using named fields for multi-key valence dictionaries #22

@kbenoit

Description

@kbenoit

ANEW, for instance, has three categories (pleasure, arousal, and dominance), but if the dictionary is not sliced to a single key, then all three are averaged together, which is probably not what a user wants.

library("quanteda")
#> Package version: 3.2.5
#> Unicode version: 14.0
#> ICU version: 70.1
#> Parallel computing: 10 of 10 threads used.
#> See https://quanteda.io for tutorials and examples.
library("quanteda.sentiment")
#> 
#> Attaching package: 'quanteda.sentiment'
#> The following object is masked from 'package:quanteda':
#> 
#>     data_dictionary_LSD2015

# Tokenise the most recent inaugural addresses (tail() keeps the last six
# documents of the corpus, matching the six rows of output below).
toks <- tokens(tail(data_corpus_inaugural))

# Score with the whole ANEW dictionary, i.e. without selecting a single key.
tstat1 <- textstat_valence(toks, dictionary = data_dictionary_ANEW)
# Rename the score column (column 2) so it can sit next to the second variant.
names(tstat1)[2] <- "sentiment_entire"

# (virtually) same as scoring against a single key whose valence is the
# per-word average of the existing valence columns:
# 1. Add a key "all", seeded with the entries of the "arousal" key.
#    NOTE(review): this presumably modifies a workspace copy of the packaged
#    dictionary, which would be why the reprex later calls
#    rm(data_dictionary_ANEW) -- confirm.
data_dictionary_ANEW["all"] <- data_dictionary_ANEW["arousal"]
# 2. Set the valence of "all" to the row-wise mean of the valence table
#    (presumably the three ANEW categories -- confirm what valence() returns
#    once the "all" key has been added).
valence(data_dictionary_ANEW)["all"] <- 
  list(all = apply(as.data.frame(valence(data_dictionary_ANEW)), 1, mean))
# Score again, this time using only the averaged "all" key.
tstat2 <- textstat_valence(toks, dictionary = data_dictionary_ANEW["all"])

# Put both score columns side by side for comparison.
df <- cbind(tstat1, data.frame(sentiment_averaged = tstat2$sentiment))
df
#>       doc_id sentiment_entire sentiment_averaged
#> 1  2001-Bush         5.614141           5.616636
#> 2  2005-Bush         5.757551           5.758452
#> 3 2009-Obama         5.460327           5.459256
#> 4 2013-Obama         5.606507           5.606507
#> 5 2017-Trump         5.701053           5.701393
#> 6 2021-Biden         5.629749           5.630278
# The two columns are almost perfectly correlated (see the estimate below),
# i.e. the un-sliced dictionary effectively averages across all keys.
cor.test(df$sentiment_entire, df$sentiment_averaged)
#> 
#>  Pearson's product-moment correlation
#> 
#> data:  df$sentiment_entire and df$sentiment_averaged
#> t = 197.75, df = 4, p-value = 3.923e-09
#> alternative hypothesis: true correlation is not equal to 0
#> 95 percent confidence interval:
#>  0.9995085 0.9999947
#> sample estimates:
#>       cor 
#> 0.9999489

# different from (e.g.) scoring against a single category such as "pleasure":
textstat_valence(toks, dictionary = data_dictionary_ANEW["pleasure"])
#>       doc_id sentiment
#> 1  2001-Bush  6.091330
#> 2  2005-Bush  6.308839
#> 3 2009-Obama  5.841437
#> 4 2013-Obama  6.045129
#> 5 2017-Trump  6.223944
#> 6 2021-Biden  6.018528

# what we probably want: one sentiment column per dictionary key

# Drop the workspace copy of the dictionary that was modified earlier in this
# reprex (it gained an extra "all" key) so the packaged ANEW dictionary is
# used again.
rm(data_dictionary_ANEW)

# Take the keys from the dictionary itself instead of hard-coding them, so the
# scores and their column labels cannot get out of step and the same code
# works for any multi-key valence dictionary.
keys <- names(data_dictionary_ANEW)
stopifnot(length(keys) > 0)

# Score the tokens against each key separately; each result is a data frame
# with `doc_id` and `sentiment` columns.
per_key <- lapply(keys, function(key) {
  textstat_valence(toks, dictionary = data_dictionary_ANEW[key])
})

# Keep only the score vector of each result, labelled "sentiment_<key>".
sentiment_by_key <- lapply(per_key, function(res) res[["sentiment"]])
names(sentiment_by_key) <- paste("sentiment", keys, sep = "_")

# Every result is computed from the same tokens, so the document ids are
# taken from the first one.
data.frame(
  doc_id = per_key[[1]][["doc_id"]],
  sentiment_by_key,
  check.names = FALSE
)
#>       doc_id sentiment_pleasure sentiment_arousal sentiment_dominance
#> 1  2001-Bush           6.091330          5.252523            5.506055
#> 2  2005-Bush           6.308839          5.315839            5.650677
#> 3 2009-Obama           5.841437          5.207523            5.328807
#> 4 2013-Obama           6.045129          5.342878            5.431513
#> 5 2017-Trump           6.223944          5.334413            5.545822
#> 6 2021-Biden           6.018528          5.327389            5.544917

Created on 2023-02-13 with reprex v2.0.2

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions