2525# ' If `x` is a dfm, [RSpectra::svds()], [irlba::irlba()] or [rsvd::rsvd()].
2626# ' If `x` is a fcm, [rsparse::GloVe()].
2727# ' If `x` is a tokens (or tokens_xptr), [wordvector::textmodel_word2vec()].
28- # ' @param auto_weight automatically determine weights to approximate the
29- # ' polarity of terms to seed words. Deprecated .
28+ # ' @param nested_weight if `TRUE`, assign smaller weights to seed words based on
29+ # ' the number of glob pattern matches.
3030# ' @param verbose show messages if `TRUE`.
3131# ' @param ... additional arguments passed to the underlying engine.
3232# ' @export
5757# ' spatial models, they are predicted probability that the seed words to occur in
5858# ' their contexts. The probabilistic models are still experimental, so use them with caution.
5959# '
60+ # ' Set `nested_weight = TRUE` to limit the impact of glob patterns used in seed words.
61+ # ' When it is `FALSE`, the weights of the seed words are all equal, being the inverse
62+ # ' of the number of seed words matched. When it is `TRUE`, the weights are distributed
63+ # ' equally within the same glob pattern. LSS becomes more similar to dictionary analysis
64+ # ' when it is `FALSE`.
65+ # '
6066# ' Please visit the [package website](https://koheiw.github.io/LSX/) for examples.
6167# ' @references Watanabe, Kohei. 2020. "Latent Semantic Scaling: A Semisupervised
6268# ' Text Analysis Technique for New Domains and Languages", Communication
@@ -87,7 +93,7 @@ textmodel_lss.dfm <- function(x, seeds, terms = NULL, k = 300, slice = NULL,
8793 weight = " count" , cache = FALSE ,
8894 simil_method = " cosine" ,
8995 engine = c(" RSpectra" , " irlba" , " rsvd" ),
90- auto_weight = FALSE ,
96+ nested_weight = TRUE ,
9197 include_data = FALSE ,
9298 group_data = FALSE ,
9399 verbose = FALSE , ... ) {
@@ -100,7 +106,7 @@ textmodel_lss.dfm <- function(x, seeds, terms = NULL, k = 300, slice = NULL,
100106
101107 k <- check_integer(k , min_len = 1 , max_len = 1 , min = 2 , max = nrow(x ))
102108 engine <- match.arg(engine )
103- seeds <- expand_seeds(seeds , featnames(x ), verbose )
109+ seeds <- expand_seeds(seeds , featnames(x ), nested_weight , verbose )
104110 seed <- unlist(unname(seeds ))
105111 theta <- get_theta(terms , featnames(x ))
106112 feat <- union(names(theta ), names(seed ))
@@ -123,9 +129,6 @@ textmodel_lss.dfm <- function(x, seeds, terms = NULL, k = 300, slice = NULL,
123129 slice <- seq_len(slice )
124130
125131 simil <- get_simil(embed , names(seed ), names(theta ), slice , simil_method )
126- if (auto_weight )
127- seed <- optimize_weight(seed , simil , verbose )
128-
129132 beta <- get_beta(simil , seed ) * theta
130133
131134 result <- build_lss(
@@ -169,7 +172,7 @@ textmodel_lss.fcm <- function(x, seeds, terms = NULL, k = 50,
169172 weight = " count" , cache = FALSE ,
170173 simil_method = " cosine" ,
171174 engine = " rsparse" ,
172- auto_weight = FALSE ,
175+ nested_weight = TRUE ,
173176 verbose = FALSE , ... ) {
174177
175178 args <- list (terms = terms , seeds = seeds , ... )
@@ -186,7 +189,7 @@ textmodel_lss.fcm <- function(x, seeds, terms = NULL, k = 50,
186189 k <- args $ w
187190 }
188191
189- seeds <- expand_seeds(seeds , featnames(x ), verbose )
192+ seeds <- expand_seeds(seeds , featnames(x ), nested_weight , verbose )
190193 seed <- unlist(unname(seeds ))
191194 term <- expand_terms(terms , featnames(x ))
192195 feat <- union(term , names(seed ))
@@ -201,9 +204,6 @@ textmodel_lss.fcm <- function(x, seeds, terms = NULL, k = 50,
201204 }
202205
203206 simil <- get_simil(embed , names(seed ), term , seq_len(k ), simil_method )
204- if (auto_weight )
205- seed <- optimize_weight(seed , simil , verbose )
206-
207207 beta <- get_beta(simil , seed )
208208
209209 result <- build_lss(
@@ -262,18 +262,14 @@ expand_terms <- function(terms, features) {
262262 return (result )
263263}
264264
# Resolve seed word patterns against the dfm's features and attach weights.
# Returns a named list (one element per seed pattern) of weighted matches;
# errors when no pattern matches any feature. `verbose` is retained for
# backward compatibility with existing callers but is no longer used here.
expand_seeds <- function(seeds, features, nested_weight = TRUE, verbose = FALSE) {

    pats <- get_seeds(seeds)
    weighted <- weight_seeds(pats, features, nested_weight)

    if (!any(lengths(weighted) > 0))
        stop("No seed word is found in the dfm", call. = FALSE)

    weighted
}
279275
@@ -431,33 +427,31 @@ coefficients.textmodel_lss <- function(object, ...) {
# ' Internal function to generate equally-weighted seed set
# '
# ' @param seeds named numeric vector of seed scores; names are glob patterns.
# ' @param type character vector of feature names to match the patterns against.
# ' @param nested_weight if `TRUE`, split each pattern's weight equally among
# '   its matches; if `FALSE`, weight all matched words equally per polarity.
# ' @keywords internal
weight_seeds <- function(seeds, type, nested_weight = TRUE) {
    # Expand each glob pattern to the concrete features it matches.
    matched <- lapply(names(seeds), function(pat) {
        m <- unlist(quanteda::pattern2fixed(pat, type, "glob", FALSE))
        if (is.null(m)) character() else m
    })
    # Per-polarity normalizer: by pattern count (nested) or by total match count.
    if (nested_weight) {
        weight <- 1 / xtabs(~ seeds > 0)
    } else {
        weight <- 1 / xtabs(lengths(matched) ~ seeds > 0)
    }
    mapply(function(w, feats) {
        if (length(feats) == 0)
            return(numeric())
        base <- unname(w * weight[as.character(w > 0)])
        if (nested_weight)
            base <- base / length(feats)  # share the pattern's weight among matches
        v <- rep(base, length(feats))
        names(v) <- feats
        v
    }, seeds, matched, SIMPLIFY = FALSE)
}
451455
# Automatically align seed polarity scores with their original weights.
# Deprecated together with the `auto_weight` argument; kept so that old calls
# emit a deprecation notice rather than failing outright.
optimize_weight <- function (seed, simil, verbose) {
    .Deprecated(old = "auto_weight")
    if (verbose)
        cat("Optimizing seed weights...\n")
    # Least-squares fit: find seed weights whose similarity-weighted sums best
    # reproduce the original seed scores (optim() default Nelder-Mead search).
    result <- optim(seed, function (x) {
        sum((rowSums(simil$seeds %*% x) - seed) ^ 2)
    })
    return(result$par)
}
462456
463457
0 commit comments