Skip to content

Commit d2e7cd0

Browse files
authored
Merge pull request #47 from quanteda/fix-YulesK
Fix Yule's K computation
2 parents a9bd8f7 + b8ed442 commit d2e7cd0

File tree

4 files changed

+53
-16
lines changed

4 files changed

+53
-16
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
Package: quanteda.textstats
2-
Version: 0.94.1.9000
2+
Version: 0.94.9000
33
Title: Textual Statistics for the Quantitative Analysis of Textual Data
44
Description: Textual statistics functions formerly in the 'quanteda' package.
55
Textual statistics for characterizing and comparing textual data. Includes

NEWS.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
# quanteda.textstats 0.94.2
1+
# quanteda.textstats 0.95
22

33
* Updated `textstat_simil()` for new **proxyC** version v0.2.2, which affects how similarities are returned for `NA` values. See #45.
4+
* Fixed a bug in the computation of Yule's K (#46).
45

56
# quanteda.textstats 0.94.1
67

R/textstat_lexdiv.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,7 @@ compute_lexdiv_dfm_stats <- function(x, measure = NULL, log.base = 10) {
337337
}
338338

339339
if ("K" %in% measure)
340-
result[["K"]] <- 10 ^ 4 * vapply(ViN, function(y) sum(y$ViN * (y$i / y$n_tokens) ^ 2), numeric(1))
340+
result[["K"]] <- 10 ^ 4 * vapply(ViN, function(y) (-1 / y$n_tokens[1]) + sum(y$ViN * (y$i / y$n_tokens) ^ 2), numeric(1))
341341
if ("I" %in% measure) {
342342
M_2 <- vapply(ViN, function(y) sum(y$ViN * y$i^2), numeric(1))
343343
M_1 <- n_types

tests/testthat/test-textstat_lexdiv.R

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -100,23 +100,46 @@ test_that("Yule's K and Herndon's Vm correction are (approximately) correct", {
100100
# work with chapter 1
101101
data_dfm_stjohnch1 <- dfm_subset(data_dfm_stjohn, chapter == 1)
102102

103-
expect_equal(
104-
as.integer(ntoken(data_dfm_stjohnch1)), # 770
105-
755L, # from Miranda-Garcia and Calle-Martin (2005, Table 1)
106-
tol = 15 # might differ b/c of different translations, spellings, or token-counting method
103+
freqs <- data_dfm_stjohnch1 %>%
104+
featfreq() %>%
105+
head(n = 331) %>%
106+
sort(decreasing = FALSE)
107+
freqnames <- names(freqs)
108+
# from Table 1
109+
freqs <- c(rep(1, 212),
110+
rep(2, 51),
111+
rep(3, 26),
112+
rep(4, 13),
113+
rep(5, 6),
114+
rep(6, 6),
115+
rep(7, 3),
116+
rep(8, 4),
117+
rep(10, 1),
118+
rep(11, 1),
119+
rep(13, 3),
120+
rep(16, 1),
121+
rep(17, 1),
122+
rep(19, 1),
123+
rep(21, 1),
124+
rep(59, 1))
125+
names(freqs) <- freqnames
126+
dfmat <- as.dfm(matrix(freqs, nrow = 1, dimnames = list(docnames(data_dfm_stjohnch1),
127+
freqnames)))
128+
expect_identical(
129+
as.integer(ntoken(dfmat)), # 770
130+
755L # from Miranda-Garcia and Calle-Martin (2005, Table 1)
107131
)
108132

109-
expect_equal(
110-
as.integer(ntype(data_dfm_stjohnch1)), # 329
111-
331L, # from Miranda-Garcia and Calle-Martin (2005, Table 1)
112-
tol = 2 # might be off because of different translations or token-counting method
133+
expect_identical(
134+
as.integer(ntype(dfmat)), # 329
135+
331L # from Miranda-Garcia and Calle-Martin (2005, Table 1)
113136
)
114137

115138
expect_equivalent(
116-
textstat_lexdiv(data_dfm_stjohnch1, "K"), # 129.0943
139+
textstat_lexdiv(dfmat, "K"), # 112.767
117140
# from Miranda-Garcia and Calle-Martin (2005, Table 3)
118-
data.frame(document = "chap1", K = 126.3366167, stringsAsFactors = FALSE),
119-
tol = 3
141+
data.frame(document = "chap1", K = 113.091583, stringsAsFactors = FALSE),
142+
tolerance = 0.5
120143
)
121144

122145
# tests on multiple documents - this is Ch 1 and Chs 1-4 as per the first two rows of
@@ -126,9 +149,9 @@ test_that("Yule's K and Herndon's Vm correction are (approximately) correct", {
126149
docnames(data_dfm_stjohncomb)[2] <- "chaps1-4"
127150
expect_equivalent(
128151
textstat_lexdiv(data_dfm_stjohncomb, "K"),
129-
data.frame(document = c("chap1", "chaps1-4"), K = c(126.3366167, 99.43763148),
152+
data.frame(document = c("chap1", "chaps1-4"), K = c(113.091583, 109.957455),
130153
stringsAsFactors = FALSE),
131-
tol = 3
154+
tolerance = 1
132155
)
133156

134157
# try also Herdan's Vm and Simpson's D - these are VERY WEAK tests
@@ -406,3 +429,16 @@ test_that("dfm_split_hyphenated_features works as expected", {
406429
c("one", "two", "three", ".", "-")
407430
)
408431
})
432+
433+
test_that("Exact tests for Yule's K", {
434+
txt <- c("a b c d d e e f f f",
435+
"a b c d d e e f f f g g g g")
436+
toks <- tokens(txt)
437+
textstat_lexdiv(toks, "K")
438+
439+
# from koRpus and in issue #46
440+
expect_equal(
441+
round(textstat_lexdiv(toks, "K")$K, 3),
442+
c(1000, 1122.449)
443+
)
444+
})

0 commit comments

Comments (0)