@@ -100,23 +100,46 @@ test_that("Yule's K and Herndon's Vm correction are (approximately) correct", {
100100 # work with chapter 1
101101 data_dfm_stjohnch1 <- dfm_subset(data_dfm_stjohn , chapter == 1 )
102102
103- expect_equal(
104- as.integer(ntoken(data_dfm_stjohnch1 )), # 770
105- 755L , # from Miranda-Garcia and Calle-Martin (2005, Table 1)
106- tol = 15 # might differ b/c of different translations, spellings, or token-counting method
103+ freqs <- data_dfm_stjohnch1 %> %
104+ featfreq() %> %
105+ head(n = 331 ) %> %
106+ sort(decreasing = FALSE )
107+ freqnames <- names(freqs )
108+ # from Table 1
109+ freqs <- c(rep(1 , 212 ),
110+ rep(2 , 51 ),
111+ rep(3 , 26 ),
112+ rep(4 , 13 ),
113+ rep(5 , 6 ),
114+ rep(6 , 6 ),
115+ rep(7 , 3 ),
116+ rep(8 , 4 ),
117+ rep(10 , 1 ),
118+ rep(11 , 1 ),
119+ rep(13 , 3 ),
120+ rep(16 , 1 ),
121+ rep(17 , 1 ),
122+ rep(19 , 1 ),
123+ rep(21 , 1 ),
124+ rep(59 , 1 ))
125+ names(freqs ) <- freqnames
126+ dfmat <- as.dfm(matrix (freqs , nrow = 1 , dimnames = list (docnames(data_dfm_stjohnch1 ),
127+ freqnames )))
128+ expect_identical(
129+ as.integer(ntoken(dfmat )), # 770
130+ 755L # from Miranda-Garcia and Calle-Martin (2005, Table 1)
107131 )
108132
109- expect_equal(
110- as.integer(ntype(data_dfm_stjohnch1 )), # 329
111- 331L , # from Miranda-Garcia and Calle-Martin (2005, Table 1)
112- tol = 2 # might be off because of different translations or token-counting method
133+ expect_identical(
134+ as.integer(ntype(dfmat )), # 329
135+ 331L # from Miranda-Garcia and Calle-Martin (2005, Table 1)
113136 )
114137
115138 expect_equivalent(
116- textstat_lexdiv(data_dfm_stjohnch1 , " K" ), # 129.0943
139+ textstat_lexdiv(dfmat , " K" ), # 112.767
117140 # from Miranda-Garcia and Calle-Martin (2005, Table 3)
118- data.frame (document = " chap1" , K = 126.3366167 , stringsAsFactors = FALSE ),
119- tol = 3
141+ data.frame (document = " chap1" , K = 113.091583 , stringsAsFactors = FALSE ),
142+ tolerance = 0.5
120143 )
121144
122145 # tests on multiple documents - this is Ch 1 and Chs 1-4 as per the first two rows of
@@ -126,9 +149,9 @@ test_that("Yule's K and Herndon's Vm correction are (approximately) correct", {
126149 docnames(data_dfm_stjohncomb )[2 ] <- " chaps1-4"
127150 expect_equivalent(
128151 textstat_lexdiv(data_dfm_stjohncomb , " K" ),
129- data.frame (document = c(" chap1" , " chaps1-4" ), K = c(126.3366167 , 99.43763148 ),
152+ data.frame (document = c(" chap1" , " chaps1-4" ), K = c(113.091583 , 109.957455 ),
130153 stringsAsFactors = FALSE ),
131- tol = 3
154+ tolerance = 1
132155 )
133156
134157 # try also Herdan's Vm and Simpson's D - these are VERY WEAK tests
@@ -406,3 +429,16 @@ test_that("dfm_split_hyphenated_features works as expected", {
406429 c(" one" , " two" , " three" , " ." , " -" )
407430 )
408431})
432+
433+ test_that(" Exact tests for Yule's K" , {
434+ txt <- c(" a b c d d e e f f f" ,
435+ " a b c d d e e f f f g g g g" )
436+ toks <- tokens(txt )
437+ textstat_lexdiv(toks , " K" )
438+
439+ # from koRpus and in issue #46
440+ expect_equal(
441+ round(textstat_lexdiv(toks , " K" )$ K , 3 ),
442+ c(1000 , 1122.449 )
443+ )
444+ })
0 commit comments