Skip to content

Commit d2e7cd0

Browse files
authored
Merge pull request #47 from quanteda/fix-YulesK
Fix Yule's K computation
2 parents a9bd8f7 + b8ed442 commit d2e7cd0

File tree

4 files changed

+53
-16
lines changed

4 files changed

+53
-16
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
Package: quanteda.textstats
2-
Version: 0.94.1.9000
2+
Version: 0.94.9000
33
Title: Textual Statistics for the Quantitative Analysis of Textual Data
44
Description: Textual statistics functions formerly in the 'quanteda' package.
55
Textual statistics for characterizing and comparing textual data. Includes

NEWS.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
# quanteda.textstats 0.94.2
1+
# quanteda.textstats 0.95
22

33
* Updated `textstat_simil()` for new **proxyC** version v0.2.2, which affects how similarities are returned for `NA` values. See #45.
4+
* Fixed a bug in the computation of Yule's K (#46).
45

56
# quanteda.textstats 0.94.1
67

R/textstat_lexdiv.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,7 @@ compute_lexdiv_dfm_stats <- function(x, measure = NULL, log.base = 10) {
337337
}
338338

339339
if ("K" %in% measure)
340-
result[["K"]] <- 10 ^ 4 * vapply(ViN, function(y) sum(y$ViN * (y$i / y$n_tokens) ^ 2), numeric(1))
340+
result[["K"]] <- 10 ^ 4 * vapply(ViN, function(y) (-1 / y$n_tokens[1]) + sum(y$ViN * (y$i / y$n_tokens) ^ 2), numeric(1))
341341
if ("I" %in% measure) {
342342
M_2 <- vapply(ViN, function(y) sum(y$ViN * y$i^2), numeric(1))
343343
M_1 <- n_types

tests/testthat/test-textstat_lexdiv.R

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -100,23 +100,46 @@ test_that("Yule's K and Herndon's Vm correction are (approximately) correct", {
100100
# work with chapter 1
101101
data_dfm_stjohnch1 <- dfm_subset(data_dfm_stjohn, chapter == 1)
102102

103-
expect_equal(
104-
as.integer(ntoken(data_dfm_stjohnch1)), # 770
105-
755L, # from Miranda-Garcia and Calle-Martin (2005, Table 1)
106-
tol = 15 # might differ b/c of different translations, spellings, or token-counting method
103+
freqs <- data_dfm_stjohnch1 %>%
104+
featfreq() %>%
105+
head(n = 331) %>%
106+
sort(decreasing = FALSE)
107+
freqnames <- names(freqs)
108+
# from Table 1
109+
freqs <- c(rep(1, 212),
110+
rep(2, 51),
111+
rep(3, 26),
112+
rep(4, 13),
113+
rep(5, 6),
114+
rep(6, 6),
115+
rep(7, 3),
116+
rep(8, 4),
117+
rep(10, 1),
118+
rep(11, 1),
119+
rep(13, 3),
120+
rep(16, 1),
121+
rep(17, 1),
122+
rep(19, 1),
123+
rep(21, 1),
124+
rep(59, 1))
125+
names(freqs) <- freqnames
126+
dfmat <- as.dfm(matrix(freqs, nrow = 1, dimnames = list(docnames(data_dfm_stjohnch1),
127+
freqnames)))
128+
expect_identical(
129+
as.integer(ntoken(dfmat)), # 770
130+
755L # from Miranda-Garcia and Calle-Martin (2005, Table 1)
107131
)
108132

109-
expect_equal(
110-
as.integer(ntype(data_dfm_stjohnch1)), # 329
111-
331L, # from Miranda-Garcia and Calle-Martin (2005, Table 1)
112-
tol = 2 # might be off because of different translations or token-counting method
133+
expect_identical(
134+
as.integer(ntype(dfmat)), # 329
135+
331L # from Miranda-Garcia and Calle-Martin (2005, Table 1)
113136
)
114137

115138
expect_equivalent(
116-
textstat_lexdiv(data_dfm_stjohnch1, "K"), # 129.0943
139+
textstat_lexdiv(dfmat, "K"), # 112.767
117140
# from Miranda-Garcia and Calle-Martin (2005, Table 3)
118-
data.frame(document = "chap1", K = 126.3366167, stringsAsFactors = FALSE),
119-
tol = 3
141+
data.frame(document = "chap1", K = 113.091583, stringsAsFactors = FALSE),
142+
tolerance = 0.5
120143
)
121144

122145
# tests on multiple documents - this is Ch 1 and Chs 1-4 as per the first two rows of
@@ -126,9 +149,9 @@ test_that("Yule's K and Herndon's Vm correction are (approximately) correct", {
126149
docnames(data_dfm_stjohncomb)[2] <- "chaps1-4"
127150
expect_equivalent(
128151
textstat_lexdiv(data_dfm_stjohncomb, "K"),
129-
data.frame(document = c("chap1", "chaps1-4"), K = c(126.3366167, 99.43763148),
152+
data.frame(document = c("chap1", "chaps1-4"), K = c(113.091583, 109.957455),
130153
stringsAsFactors = FALSE),
131-
tol = 3
154+
tolerance = 1
132155
)
133156

134157
# try also Herdan's Vm and Simpson's D - these are VERY WEAK tests
@@ -406,3 +429,16 @@ test_that("dfm_split_hyphenated_features works as expected", {
406429
c("one", "two", "three", ".", "-")
407430
)
408431
})
432+
433+
test_that("Exact tests for Yule's K", {
434+
txt <- c("a b c d d e e f f f",
435+
"a b c d d e e f f f g g g g")
436+
toks <- tokens(txt)
437+
textstat_lexdiv(toks, "K")
438+
439+
# from koRpus and in issue #46
440+
expect_equal(
441+
round(textstat_lexdiv(toks, "K")$K, 3),
442+
c(1000, 1122.449)
443+
)
444+
})

0 commit comments

Comments (0)