koheiw
diff --git a/‎DESCRIPTION‎
Lines changed: 1 addition & 1 deletion b/‎DESCRIPTION‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎NEWS.md‎
Lines changed: 5 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎R/as.textmodel.R‎
Lines changed: 6 additions & 6 deletions b/‎R/as.textmodel.R‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎R/textmodel_lss.R‎
Lines changed: 25 additions & 31 deletions b/‎R/textmodel_lss.R‎
Lines changed: 25 additions & 31 deletions
diff --git a/‎R/textmodel_lss2.R‎
Lines changed: 2 additions & 1 deletion b/‎R/textmodel_lss2.R‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎man/as.textmodel_lss.Rd‎
Lines changed: 12 additions & 4 deletions b/‎man/as.textmodel_lss.Rd‎
Lines changed: 12 additions & 4 deletions
diff --git a/‎man/textmodel_lss.Rd‎
Lines changed: 11 additions & 4 deletions b/‎man/textmodel_lss.Rd‎
Lines changed: 11 additions & 4 deletions
diff --git a/‎man/weight_seeds.Rd‎
Lines changed: 1 addition & 1 deletion b/‎man/weight_seeds.Rd‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎tests/testthat/test-as.textmodel.R‎
Lines changed: 0 additions & 22 deletions b/‎tests/testthat/test-as.textmodel.R‎
Lines changed: 0 additions & 22 deletions
@@ -1,7 +1,7 @@
 Package: LSX
 Type: Package
 Title: Semi-Supervised Algorithm for Document Scaling
-Version: 1.5.1
+Version: 1.5.2
 Authors@R: person("Kohei", "Watanabe", email = "watanabe.kohei@gmail.com", role = c("aut", "cre", "cph"))
 Description: A word embeddings-based semi-supervised model for document scaling Watanabe (2020) <doi:10.1080/19312458.2020.1832976>.
     LSS allows users to analyze large and complex corpora on arbitrary dimensions with seed words exploiting efficiency of word embeddings (SVD, Glove).
 
@@ -1,3 +1,8 @@
+## Changes in v1.5.2
+
+* Add `nested_weight` to `textmodel_lss()` and `as.textmodel_lss()` to perform dictionary-like analysis.
+* Remove `auto_weight` from `textmodel_lss()`.
+
 ## Changes in v1.5.1
 
 * Support `textmodel_wordvector` objects from **wordvector** v0.6.0.
 
@@ -27,7 +27,7 @@ as.textmodel_lss <- function(x, ...) {
 as.textmodel_lss.matrix <- function(x, seeds,
                                     terms = NULL, slice = NULL,
                                     simil_method = "cosine",
-                                    auto_weight = FALSE,
+                                    nested_weight = TRUE,
                                     verbose = FALSE, ...) {
 
   args <- list(terms = terms, seeds = seeds)
@@ -38,7 +38,7 @@ as.textmodel_lss.matrix <- function(x, seeds,
   if (any(is.na(x)))
     stop("x must not have NA")
 
-  seeds <- expand_seeds(seeds, colnames(x), verbose)
+  seeds <- expand_seeds(seeds, colnames(x), nested_weight, verbose)
   seed <- unlist(unname(seeds))
   theta <- get_theta(terms, colnames(x))
 
@@ -51,8 +51,6 @@ as.textmodel_lss.matrix <- function(x, seeds,
     slice <- seq_len(slice)
 
   simil <- get_simil(x, names(seed), names(theta), slice, simil_method)
-  if (auto_weight)
-    seed <- optimize_weight(seed, simil, verbose)
   beta <- get_beta(simil, seed) * theta
 
   result <- build_lss(
@@ -108,6 +106,7 @@ as.textmodel_lss.textmodel_lss <- function(x, ...) {
 #' @method as.textmodel_lss textmodel_wordvector
 as.textmodel_lss.textmodel_wordvector <- function(x, seeds,
                                                   terms = NULL,
+                                                  nested_weight = TRUE,
                                                   verbose = FALSE,
                                                   spatial = TRUE,
                                                   ...) {
@@ -123,7 +122,8 @@ as.textmodel_lss.textmodel_wordvector <- function(x, seeds,
     } else {
       values <- x$values
     }
-    result <- as.textmodel_lss(t(values), seeds = seeds, terms = terms, ...)
+    result <- as.textmodel_lss(t(values), seeds = seeds, terms = terms,
+                               nested_weight = nested_weight, ...)
     result$frequency <- x$frequency[names(result$beta)]
     result$type = "word2vec"
     result$call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE)
@@ -135,7 +135,7 @@ as.textmodel_lss.textmodel_wordvector <- function(x, seeds,
     if (x$version < as.numeric_version("0.2.0"))
       stop("wordvector package must be v0.2.0 or later")
 
-    seeds <- expand_seeds(seeds, names(x$frequency), verbose)
+    seeds <- expand_seeds(seeds, names(x$frequency), nested_weight, verbose)
     seed <- unlist(unname(seeds))
     theta <- get_theta(terms, names(x$frequency))
 
 
@@ -25,8 +25,8 @@
 #'    If `x` is a dfm, [RSpectra::svds()], [irlba::irlba()] or [rsvd::rsvd()].
 #'    If `x` is a fcm, [rsparse::GloVe()].
 #'    If `x` is a tokens (or tokens_xptr), [wordvector::textmodel_word2vec()].
-#' @param auto_weight automatically determine weights to approximate the
-#'   polarity of terms to seed words. Deprecated.
+#' @param nested_weight if `TRUE`, divide the weight of each glob pattern equally
+#'    among the seed words it matches; if `FALSE`, weight all matched seed words equally.
 #' @param verbose show messages if `TRUE`.
 #' @param ... additional arguments passed to the underlying engine.
 #' @export
@@ -57,6 +57,12 @@
 #'   spatial models, they are predicted probability that the seed words to occur in
 #'   their contexts. The probabilistic models are still experimental, so use them with caution.
 #'
+#'   Set `nested_weight = TRUE` to limit the impact of glob patterns used in seed words.
+#'   When it is `FALSE`, the weights of the seed words of the same polarity are all equal,
+#'   being the inverse of the number of seed words matched. When it is `TRUE`, each glob
+#'   pattern receives an equal weight, which is equally distributed among the words it
+#'   matches. LSS becomes more similar to dictionary analysis when it is `FALSE`.
+#'
 #'   Please visit the [package website](https://koheiw.github.io/LSX/) for examples.
 #' @references Watanabe, Kohei. 2020. "Latent Semantic Scaling: A Semisupervised
 #'   Text Analysis Technique for New Domains and Languages", Communication
@@ -87,7 +93,7 @@ textmodel_lss.dfm <- function(x, seeds, terms = NULL, k = 300, slice = NULL,
                               weight = "count", cache = FALSE,
                               simil_method = "cosine",
                               engine = c("RSpectra", "irlba", "rsvd"),
-                              auto_weight = FALSE,
+                              nested_weight = TRUE,
                               include_data = FALSE,
                               group_data = FALSE,
                               verbose = FALSE, ...) {
@@ -100,7 +106,7 @@ textmodel_lss.dfm <- function(x, seeds, terms = NULL, k = 300, slice = NULL,
 
     k <- check_integer(k, min_len = 1, max_len = 1, min = 2, max = nrow(x))
     engine <- match.arg(engine)
-    seeds <- expand_seeds(seeds, featnames(x), verbose)
+    seeds <- expand_seeds(seeds, featnames(x), nested_weight, verbose)
     seed <- unlist(unname(seeds))
     theta <- get_theta(terms, featnames(x))
     feat <- union(names(theta), names(seed))
@@ -123,9 +129,6 @@ textmodel_lss.dfm <- function(x, seeds, terms = NULL, k = 300, slice = NULL,
         slice <- seq_len(slice)
 
     simil <- get_simil(embed, names(seed), names(theta), slice, simil_method)
-    if (auto_weight)
-        seed <- optimize_weight(seed, simil, verbose)
-
     beta <- get_beta(simil, seed) * theta
 
     result <- build_lss(
@@ -169,7 +172,7 @@ textmodel_lss.fcm <- function(x, seeds, terms = NULL, k = 50,
                               weight = "count", cache = FALSE,
                               simil_method = "cosine",
                               engine = "rsparse",
-                              auto_weight = FALSE,
+                              nested_weight = TRUE,
                               verbose = FALSE, ...) {
 
     args <- list(terms = terms, seeds = seeds, ...)
@@ -186,7 +189,7 @@ textmodel_lss.fcm <- function(x, seeds, terms = NULL, k = 50,
         k <- args$w
     }
 
-    seeds <- expand_seeds(seeds, featnames(x), verbose)
+    seeds <- expand_seeds(seeds, featnames(x), nested_weight, verbose)
     seed <- unlist(unname(seeds))
     term <- expand_terms(terms, featnames(x))
     feat <- union(term, names(seed))
@@ -201,9 +204,6 @@ textmodel_lss.fcm <- function(x, seeds, terms = NULL, k = 50,
     }
 
     simil <- get_simil(embed, names(seed), term, seq_len(k), simil_method)
-    if (auto_weight)
-        seed <- optimize_weight(seed, simil, verbose)
-
     beta <- get_beta(simil, seed)
 
     result <- build_lss(
@@ -262,18 +262,14 @@ expand_terms <- function(terms, features) {
     return(result)
 }
 
-expand_seeds <- function(seeds, features, verbose = FALSE) {
+expand_seeds <- function(seeds, features, nested_weight = TRUE, verbose = FALSE) {
 
     seeds <- get_seeds(seeds)
-    seeds_weighted <- weight_seeds(seeds, features)
+    seeds_weighted <- weight_seeds(seeds, features, nested_weight)
 
     if (all(lengths(seeds_weighted) == 0))
         stop("No seed word is found in the dfm", call. = FALSE)
 
-    if (verbose)
-        cat(sprintf("Calculating term-term similarity to %d seed words...\n",
-            sum(lengths(seeds_weighted))))
-
     return(seeds_weighted)
 }
 
@@ -431,33 +427,31 @@ coefficients.textmodel_lss <- function(object, ...) {
 #' Internal function to generate equally-weighted seed set
 #'
 #' @keywords internal
-weight_seeds <- function(seeds, type) {
+weight_seeds <- function(seeds, type, nested_weight = TRUE) {
     seeds_fix <- lapply(names(seeds), function(x) {
         s <- unlist(quanteda::pattern2fixed(x, type, "glob", FALSE))
         if (is.null(s))
             return(character())
         return(s)
     })
-    weight <- 1 / table(seeds > 0)
+    if (nested_weight) {
+      weight <- 1 / xtabs(~ seeds > 0)
+    } else {
+      weight <- 1 / xtabs(lengths(seeds_fix) ~ seeds > 0)
+    }
     mapply(function(x, y) {
               if (!length(y))
                   return(numeric())
-              v <- unname(x * weight[as.character(x > 0)]) / length(y)
+              if (nested_weight) {
+                v <- unname(x * weight[as.character(x > 0)]) / length(y)
+              } else {
+                v <- unname(x * weight[as.character(x > 0)])
+              }
               v <- rep(v, length(y))
               names(v) <- y
               return(v)
            }, seeds, seeds_fix, SIMPLIFY = FALSE)
 }
 
-# automatically align polarity score with original weight
-optimize_weight <- function(seed, simil, verbose) {
-    .Deprecated(old = "auto_weight")
-    if (verbose)
-        cat("Optimizing seed weights...\n")
-    result <- optim(seed, function(x) {
-        sum((rowSums(simil$seeds %*% x) - seed) ^ 2)
-    })
-    return(result$par)
-}
 
 
@@ -7,6 +7,7 @@ textmodel_lss.tokens <- function(x, seeds, terms = NULL, k = 200,
                                  min_count = 5,
                                  engine = "wordvector",
                                  tolower = TRUE,
+                                 nested_weight = TRUE,
                                  include_data = FALSE,
                                  group_data = FALSE,
                                  spatial = TRUE,
@@ -25,7 +26,7 @@ textmodel_lss.tokens <- function(x, seeds, terms = NULL, k = 200,
                                         type = "skip-gram", tolower = tolower,
                                         normalize = FALSE, verbose = verbose, ...)
   result <- as.textmodel_lss(w2v, seeds = seeds, terms = terms, spatial = spatial,
-                             verbose = FALSE)
+                             nested_weight = nested_weight, verbose = FALSE)
   result$type <- "word2vec"
   result$call <- try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE)
 
 
@@ -159,28 +159,6 @@ test_that("as.textmodel_lss errors with vector", {
                  "x must not have NA")
 })
 
-test_that("auto_weight is working", {
-    skip_on_cran()
-
-    lss1 <- as.textmodel_lss(mat_test, seed)
-    suppressWarnings({
-      lss2 <- as.textmodel_lss(mat_test, seed, auto_weight = TRUE)
-    })
-    expect_true(
-        all(lss1$seeds_weighted != lss2$seeds_weighted)
-    )
-    expect_true(
-        all(sign(lss1$seeds_weighted) == sign(lss2$seeds_weighted))
-    )
-    expect_true(
-        all(abs(lss2$beta[names(lss2$seeds_weighted)] - lss1$seeds_weighted) < 0.05)
-    )
-    expect_warning(
-        as.textmodel_lss(mat_test, seed, auto_weight = TRUE, verbose = FALSE),
-        "'auto_weight' is deprecated"
-    )
-})
-
 test_that("terms is working", {
     skip_on_cran()