Skip to content

Commit bf0b8ab

Browse files
authored
Merge pull request #48 from quanteda/fix-hamman
Change hamman to hamann
2 parents d2e7cd0 + 2e17c82 commit bf0b8ab

File tree

6 files changed

+33
-17
lines changed

6 files changed

+33
-17
lines changed

R/textstat_readability.R

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -662,7 +662,7 @@ textstat_readability.corpus <- function(x,
662662
result[["ELF"]] <- W2Sy / St
663663

664664
if ("Farr.Jenkins.Paterson" %in% measure)
665-
result[["Farr.Jenkins.Paterson"]] <- -31.517 - 1.015 * W / St + 1.599 * W_1Sy / W
665+
result[["Farr.Jenkins.Paterson"]] <- -31.517 - 1.015 * W / St + 1.599 * W_1Sy / W * 100
666666

667667
if ("Flesch" %in% measure)
668668
result[["Flesch"]] <- 206.835 - 1.015 * W / St - 84.6 * Sy / W

R/textstat_simil.R

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ setMethod("tail", signature(x = "textstat_proxy"), function(x, n = 6L, ...) {
200200
#' @param ... unused
201201
#' @details `textstat_simil` options are: `"correlation"` (default),
202202
#' `"cosine"`, `"jaccard"`, `"ejaccard"`, `"dice"`,
203-
#' `"edice"`, `"simple matching"`, and `"hamman"`.
203+
#' `"edice"`, `"simple matching"`, and `"hamann"`.
204204
#' @note If you want to compute similarity on a "normalized" dfm object
205205
#' (controlling for variable document lengths, for methods such as correlation
206206
#' for which different document lengths matter), then wrap the input dfm in
@@ -249,7 +249,7 @@ setMethod("tail", signature(x = "textstat_proxy"), function(x, n = 6L, ...) {
249249
textstat_simil <- function(x, y = NULL, selection = NULL,
250250
margin = c("documents", "features"),
251251
method = c("correlation", "cosine", "jaccard", "ejaccard",
252-
"dice", "edice", "hamman", "simple matching"),
252+
"dice", "edice", "hamann", "simple matching"),
253253
min_simil = NULL, ...) {
254254
UseMethod("textstat_simil")
255255
}
@@ -258,7 +258,7 @@ textstat_simil <- function(x, y = NULL, selection = NULL,
258258
textstat_simil.default <- function(x, y = NULL, selection = NULL,
259259
margin = c("documents", "features"),
260260
method = c("correlation", "cosine", "jaccard", "ejaccard",
261-
"dice", "edice", "hamman", "simple matching"),
261+
"dice", "edice", "hamann", "simple matching"),
262262
min_simil = NULL, ...) {
263263
stop(friendly_class_undefined_message(class(x), "textstat_simil"))
264264
}
@@ -267,7 +267,7 @@ textstat_simil.default <- function(x, y = NULL, selection = NULL,
267267
textstat_simil.dfm <- function(x, y = NULL, selection = NULL,
268268
margin = c("documents", "features"),
269269
method = c("correlation", "cosine", "jaccard", "ejaccard",
270-
"dice", "edice", "hamman", "simple matching"),
270+
"dice", "edice", "hamann", "simple matching"),
271271
min_simil = NULL, ...) {
272272

273273
if (!is.null(selection))
@@ -276,6 +276,8 @@ textstat_simil.dfm <- function(x, y = NULL, selection = NULL,
276276

277277
x <- as.dfm(x)
278278
margin <- match.arg(margin)
279+
280+
method[method == "hamman"] <- "hamann" # trap older "hamman" spelling
279281
method <- match.arg(method)
280282

281283
if (margin == "features") {
@@ -608,7 +610,7 @@ setMethod("as.matrix", "textstat_simil_symm_sparse",
608610
textstat_proxy <- function(x, y = NULL,
609611
margin = c("documents", "features"),
610612
method = c("cosine", "correlation", "jaccard", "ejaccard",
611-
"dice", "edice", "hamman", "simple matching",
613+
"dice", "edice", "hamann", "simple matching",
612614
"euclidean", "chisquared", "hamming", "kullback",
613615
"manhattan", "maximum", "canberra", "minkowski"),
614616
p = 2, min_proxy = NULL, rank = NULL, use_na = FALSE) {
@@ -622,6 +624,8 @@ textstat_proxy <- function(x, y = NULL,
622624
}
623625

624626
margin <- match.arg(margin)
627+
628+
method[method == "hamman"] <- "hamann" # trap older "hamman" spelling
625629
method <- match.arg(method)
626630

627631
if (margin == "documents") {
@@ -633,7 +637,7 @@ textstat_proxy <- function(x, y = NULL,
633637
stop("x and y must contain the same documents")
634638
}
635639
if (method %in% c("cosine", "correlation", "jaccard", "ejaccard", "dice", "edice",
636-
"hamman", "simple matching", "faith")) {
640+
"hamann", "simple matching", "faith")) {
637641
if (identical(x, y)) {
638642
suppressWarnings({
639643
result <- proxyC::simil(x, NULL, 2, method, min_simil = min_proxy, rank = rank, use_nan = use_na)

man/textstat_proxy.Rd

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/textstat_simil.Rd

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-textstat_proxy.R

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,15 @@ test_that("test textstat_proxy simple matching similarity", {
107107
test_simil(test_mt, "simple matching", "features")
108108
})
109109

110-
test_that("test textstat_proxy hamman similarity", {
110+
test_that("test textstat_proxy hamann similarity", {
111111
skip_if_not_installed("proxy")
112112
test_simil(test_mt, "hamman", "documents")
113113
test_simil(test_mt, "hamman", "features")
114+
115+
expect_identical(
116+
textstat_simil(test_mt, method = "hamman"),
117+
textstat_simil(test_mt, method = "hamann")
118+
)
114119
})
115120

116121

tests/testthat/test-textstat_simil.R

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
library("quanteda")
22

3-
mt <- corpus_subset(data_corpus_inaugural, Year > 1980 & Year < 2021) %>%
4-
tokens() %>%
3+
mt <- corpus_subset(data_corpus_inaugural, Year > 1980 & Year < 2021) %>%
4+
tokens() %>%
55
dfm()
66
mt <- dfm_trim(mt, min_termfreq = 10)
77

@@ -98,7 +98,7 @@ test_that("textstat_simil() returns NA for zero-variance documents", {
9898
as.matrix(textstat_simil(mt, method = "cosine")),
9999
mt_na_some
100100
)
101-
101+
102102
# proxy::simil is wrong
103103
# expect_equivalent(
104104
# as.matrix(textstat_simil(mt, method = "jaccard")),
@@ -119,13 +119,13 @@ test_that("textstat_simil() returns NA for zero-variance documents", {
119119
# as.matrix(textstat_simil(mt, method = "edice")),
120120
# mt_na_some
121121
# )
122-
122+
123123
# proxyC::simil is wrong (#44)
124124
# expect_equal(
125125
# as.matrix(textstat_simil(mt, method = "hamman")),
126126
# mt_na_some
127127
# )
128-
128+
129129
# proxy::simil is wrong
130130
# expect_equal(
131131
# as.matrix(textstat_simil(mt, method = "simple matching")),
@@ -165,7 +165,7 @@ test_that("selection is always on columns (#1549)", {
165165

166166
test_that("all similarities are between 0 and 1", {
167167
methods <- c("correlation", "cosine", "jaccard", "ejaccard",
168-
"dice", "edice", "hamman", "simple matching")
168+
"dice", "edice", "hamann", "simple matching")
169169
for (m in methods) {
170170
minmax <- range(textstat_simil(mt, method = m, margin = "documents"))
171171
tol <- .000001
@@ -548,3 +548,10 @@ test_that("as.data.frame works with subsetted object", {
548548
)
549549
expect_identical(levels(simildf$document1), levels(simildf$document1))
550550
})
551+
552+
test_that("hamman still works", {
553+
expect_identical(
554+
textstat_simil(data_dfm_lbgexample, method = "hamman"),
555+
textstat_simil(data_dfm_lbgexample, method = "hamann")
556+
)
557+
})

0 commit comments

Comments
 (0)