Merge pull request #48 from quanteda/fix-hamman

kbenoit · web-flow · commit bf0b8ab7d39f · 2021-11-16T09:03:20.000Z
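Change hamman to hamann (the older "hamman" spelling is still accepted); also multiply the Farr.Jenkins.Paterson monosyllabic-word term by 100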
diff --git a/R/textstat_readability.R b/R/textstat_readability.R
@@ -662,7 +662,7 @@ textstat_readability.corpus <- function(x,
         result[["ELF"]] <- W2Sy / St
 
     if ("Farr.Jenkins.Paterson" %in% measure)
-        result[["Farr.Jenkins.Paterson"]] <- -31.517 - 1.015 * W / St + 1.599 * W_1Sy / W
+        result[["Farr.Jenkins.Paterson"]] <- -31.517 - 1.015 * W / St + 1.599 * W_1Sy / W * 100
 
     if ("Flesch" %in% measure)
         result[["Flesch"]] <- 206.835 - 1.015 * W / St - 84.6 * Sy / W
diff --git a/R/textstat_simil.R b/R/textstat_simil.R
@@ -200,7 +200,7 @@ setMethod("tail", signature(x = "textstat_proxy"), function(x, n = 6L, ...) {
 #' @param ... unused
 #' @details `textstat_simil` options are: `"correlation"` (default),
 #'   `"cosine"`, `"jaccard"`, `"ejaccard"`, `"dice"`,
-#'   `"edice"`, `"simple matching"`, and `"hamman"`.
+#'   `"edice"`, `"simple matching"`, and `"hamann"`.
 #' @note If you want to compute similarity on a "normalized" dfm object
 #'   (controlling for variable document lengths, for methods such as correlation
 #'   for which different document lengths matter), then wrap the input dfm in
@@ -249,7 +249,7 @@ setMethod("tail", signature(x = "textstat_proxy"), function(x, n = 6L, ...) {
 textstat_simil <- function(x, y = NULL, selection = NULL,
                            margin = c("documents", "features"),
                            method = c("correlation", "cosine", "jaccard", "ejaccard",
-                                      "dice", "edice", "hamman", "simple matching"),
+                                      "dice", "edice", "hamann", "simple matching"),
                            min_simil = NULL, ...) {
     UseMethod("textstat_simil")
 }
@@ -258,7 +258,7 @@ textstat_simil <- function(x, y = NULL, selection = NULL,
 textstat_simil.default <- function(x, y = NULL, selection = NULL,
                                margin = c("documents", "features"),
                                method = c("correlation", "cosine", "jaccard", "ejaccard",
-                                          "dice", "edice", "hamman", "simple matching"),
+                                          "dice", "edice", "hamann", "simple matching"),
                                min_simil = NULL, ...) {
     stop(friendly_class_undefined_message(class(x), "textstat_simil"))
 }
@@ -267,7 +267,7 @@ textstat_simil.default <- function(x, y = NULL, selection = NULL,
 textstat_simil.dfm <- function(x, y = NULL, selection = NULL,
                                margin = c("documents", "features"),
                                method = c("correlation", "cosine", "jaccard", "ejaccard",
-                                          "dice", "edice", "hamman", "simple matching"),
+                                          "dice", "edice", "hamann", "simple matching"),
                                min_simil = NULL, ...) {
 
     if (!is.null(selection))
@@ -276,6 +276,8 @@ textstat_simil.dfm <- function(x, y = NULL, selection = NULL,
 
     x <- as.dfm(x)
     margin <- match.arg(margin)
+
+    method[method == "hamman"] <- "hamann" # trap older "hamman" spelling
     method <- match.arg(method)
 
     if (margin == "features") {
@@ -608,7 +610,7 @@ setMethod("as.matrix", "textstat_simil_symm_sparse",
 textstat_proxy <- function(x, y = NULL,
                            margin = c("documents", "features"),
                            method = c("cosine", "correlation", "jaccard", "ejaccard",
-                                      "dice", "edice", "hamman", "simple matching",
+                                      "dice", "edice", "hamann", "simple matching",
                                       "euclidean", "chisquared", "hamming", "kullback",
                                       "manhattan", "maximum", "canberra", "minkowski"),
                            p = 2, min_proxy = NULL, rank = NULL, use_na = FALSE) {
@@ -622,6 +624,8 @@ textstat_proxy <- function(x, y = NULL,
     }
 
     margin <- match.arg(margin)
+
+    method[method == "hamman"] <- "hamann" # trap older "hamman" spelling
     method <- match.arg(method)
 
     if (margin == "documents") {
@@ -633,7 +637,7 @@ textstat_proxy <- function(x, y = NULL,
             stop("x and y must contain the same documents")
     }
     if (method %in% c("cosine", "correlation", "jaccard", "ejaccard", "dice", "edice",
-                      "hamman", "simple matching", "faith")) {
+                      "hamann", "simple matching", "faith")) {
         if (identical(x, y)) {
             suppressWarnings({
                 result <- proxyC::simil(x, NULL, 2, method, min_simil = min_proxy, rank = rank, use_nan = use_na)
diff --git a/man/textstat_proxy.Rd b/man/textstat_proxy.Rd
diff --git a/man/textstat_simil.Rd b/man/textstat_simil.Rd
diff --git a/tests/testthat/test-textstat_proxy.R b/tests/testthat/test-textstat_proxy.R
@@ -107,10 +107,15 @@ test_that("test textstat_proxy simple matching similarity", {
     test_simil(test_mt, "simple matching", "features")
 })
 
-test_that("test textstat_proxy hamman similarity", {
+test_that("test textstat_proxy hamann similarity", {
     skip_if_not_installed("proxy")
     test_simil(test_mt, "hamman", "documents")
     test_simil(test_mt, "hamman", "features")
+
+    expect_identical(
+        textstat_simil(test_mt, method = "hamman"),
+        textstat_simil(test_mt, method = "hamann")
+    )
 })
 
 
diff --git a/tests/testthat/test-textstat_simil.R b/tests/testthat/test-textstat_simil.R
@@ -1,7 +1,7 @@
 library("quanteda")
 
-mt <- corpus_subset(data_corpus_inaugural, Year > 1980 & Year < 2021) %>% 
-  tokens() %>% 
+mt <- corpus_subset(data_corpus_inaugural, Year > 1980 & Year < 2021) %>%
+  tokens() %>%
   dfm()
 mt <- dfm_trim(mt, min_termfreq = 10)
 
@@ -98,7 +98,7 @@ test_that("textstat_simil() returns NA for zero-variance documents", {
         as.matrix(textstat_simil(mt, method = "cosine")),
         mt_na_some
     )
-  
+
     # proxy::simil is wrong
     # expect_equivalent(
     #      as.matrix(textstat_simil(mt, method = "jaccard")),
@@ -119,13 +119,13 @@ test_that("textstat_simil() returns NA for zero-variance documents", {
     #     as.matrix(textstat_simil(mt, method = "edice")),
     #     mt_na_some
     # )
-    
+
     # proxyC::simil is wrong (#44)
     # expect_equal(
     #     as.matrix(textstat_simil(mt, method = "hamman")),
     #     mt_na_some
     # )
-    
+
     # proxy::simil is wrong
     # expect_equal(
     #     as.matrix(textstat_simil(mt, method = "simple matching")),
@@ -165,7 +165,7 @@ test_that("selection is always on columns (#1549)", {
 
 test_that("all similarities are between 0 and 1", {
     methods <- c("correlation", "cosine", "jaccard", "ejaccard",
-                 "dice", "edice", "hamman", "simple matching")
+                 "dice", "edice", "hamann", "simple matching")
     for (m in methods) {
         minmax <- range(textstat_simil(mt, method = m, margin = "documents"))
         tol <- .000001
@@ -548,3 +548,10 @@ test_that("as.data.frame works with subsetted object", {
     )
     expect_identical(levels(simildf$document1), levels(simildf$document1))
 })
+
+test_that("hamman still works", {
+    expect_identical(
+        textstat_simil(data_dfm_lbgexample, method = "hamman"),
+        textstat_simil(data_dfm_lbgexample, method = "hamann")
+    )
+})