Skip to content

Commit 682e906

Browse files
authored
Merge pull request #122 from koheiw/dev-adjust_weight
Dev adjust weight
2 parents 8e145ce + f366b4c commit 682e906

12 files changed

Lines changed: 187 additions & 108 deletions

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: LSX
22
Type: Package
33
Title: Semi-Supervised Algorithm for Document Scaling
4-
Version: 1.5.1
4+
Version: 1.5.2
55
Authors@R: person("Kohei", "Watanabe", email = "watanabe.kohei@gmail.com", role = c("aut", "cre", "cph"))
66
Description: A word embeddings-based semi-supervised model for document scaling Watanabe (2020) <doi:10.1080/19312458.2020.1832976>.
77
LSS allows users to analyze large and complex corpora on arbitrary dimensions with seed words exploiting efficiency of word embeddings (SVD, Glove).

NEWS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## Changes in v1.5.2
2+
3+
* Add `nested_weight` in `textmodel_lss()` and `as.textmodel_lss()` to perform dictionary-like analysis.
4+
* Remove `auto_weight` in `textmodel_lss()`.
5+
16
## Changes in v1.5.1
27

38
* Support `textmodel_wordvector` objects from **wordvector** v0.6.0.

R/as.textmodel.R

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ as.textmodel_lss <- function(x, ...) {
2727
as.textmodel_lss.matrix <- function(x, seeds,
2828
terms = NULL, slice = NULL,
2929
simil_method = "cosine",
30-
auto_weight = FALSE,
30+
nested_weight = TRUE,
3131
verbose = FALSE, ...) {
3232

3333
args <- list(terms = terms, seeds = seeds)
@@ -38,7 +38,7 @@ as.textmodel_lss.matrix <- function(x, seeds,
3838
if (any(is.na(x)))
3939
stop("x must not have NA")
4040

41-
seeds <- expand_seeds(seeds, colnames(x), verbose)
41+
seeds <- expand_seeds(seeds, colnames(x), nested_weight, verbose)
4242
seed <- unlist(unname(seeds))
4343
theta <- get_theta(terms, colnames(x))
4444

@@ -51,8 +51,6 @@ as.textmodel_lss.matrix <- function(x, seeds,
5151
slice <- seq_len(slice)
5252

5353
simil <- get_simil(x, names(seed), names(theta), slice, simil_method)
54-
if (auto_weight)
55-
seed <- optimize_weight(seed, simil, verbose)
5654
beta <- get_beta(simil, seed) * theta
5755

5856
result <- build_lss(
@@ -108,6 +106,7 @@ as.textmodel_lss.textmodel_lss <- function(x, ...) {
108106
#' @method as.textmodel_lss textmodel_wordvector
109107
as.textmodel_lss.textmodel_wordvector <- function(x, seeds,
110108
terms = NULL,
109+
nested_weight = TRUE,
111110
verbose = FALSE,
112111
spatial = TRUE,
113112
...) {
@@ -123,7 +122,8 @@ as.textmodel_lss.textmodel_wordvector <- function(x, seeds,
123122
} else {
124123
values <- x$values
125124
}
126-
result <- as.textmodel_lss(t(values), seeds = seeds, terms = terms, ...)
125+
result <- as.textmodel_lss(t(values), seeds = seeds, terms = terms,
126+
nested_weight = nested_weight, ...)
127127
result$frequency <- x$frequency[names(result$beta)]
128128
result$type = "word2vec"
129129
result$call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE)
@@ -135,7 +135,7 @@ as.textmodel_lss.textmodel_wordvector <- function(x, seeds,
135135
if (x$version < as.numeric_version("0.2.0"))
136136
stop("wordvector package must be v0.2.0 or later")
137137

138-
seeds <- expand_seeds(seeds, names(x$frequency), verbose)
138+
seeds <- expand_seeds(seeds, names(x$frequency), nested_weight, verbose)
139139
seed <- unlist(unname(seeds))
140140
theta <- get_theta(terms, names(x$frequency))
141141

R/textmodel_lss.R

Lines changed: 25 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@
2525
#' If `x` is a dfm, [RSpectra::svds()], [irlba::irlba()] or [rsvd::rsvd()].
2626
#' If `x` is a fcm, [rsparse::GloVe()].
2727
#' If `x` is a tokens (or tokens_xptr), [wordvector::textmodel_word2vec()].
28-
#' @param auto_weight automatically determine weights to approximate the
29-
#' polarity of terms to seed words. Deprecated.
28+
#' @param nested_weight if `TRUE`, assign smaller weights to seed words based on
29+
#' the number of glob pattern matches.
3030
#' @param verbose show messages if `TRUE`.
3131
#' @param ... additional arguments passed to the underlying engine.
3232
#' @export
@@ -57,6 +57,12 @@
5757
#' spatial models, they are predicted probability that the seed words to occur in
5858
#' their contexts. The probabilistic models are still experimental, so use them with caution.
5959
#'
60+
#' `nested_weight = TRUE` to limit the impact of glob patterns used in seed words.
61+
#' When it is `FALSE`, the weights of the seed words are all equal being the inverse of
62+
#' the number of seed words matched. When it is `TRUE`, the weights are equally distributed
63+
#' within the same glob pattern. LSS becomes more similar to dictionary analysis
64+
#' when it is `FALSE`.
65+
#'
6066
#' Please visit the [package website](https://koheiw.github.io/LSX/) for examples.
6167
#' @references Watanabe, Kohei. 2020. "Latent Semantic Scaling: A Semisupervised
6268
#' Text Analysis Technique for New Domains and Languages", Communication
@@ -87,7 +93,7 @@ textmodel_lss.dfm <- function(x, seeds, terms = NULL, k = 300, slice = NULL,
8793
weight = "count", cache = FALSE,
8894
simil_method = "cosine",
8995
engine = c("RSpectra", "irlba", "rsvd"),
90-
auto_weight = FALSE,
96+
nested_weight = TRUE,
9197
include_data = FALSE,
9298
group_data = FALSE,
9399
verbose = FALSE, ...) {
@@ -100,7 +106,7 @@ textmodel_lss.dfm <- function(x, seeds, terms = NULL, k = 300, slice = NULL,
100106

101107
k <- check_integer(k, min_len = 1, max_len = 1, min = 2, max = nrow(x))
102108
engine <- match.arg(engine)
103-
seeds <- expand_seeds(seeds, featnames(x), verbose)
109+
seeds <- expand_seeds(seeds, featnames(x), nested_weight, verbose)
104110
seed <- unlist(unname(seeds))
105111
theta <- get_theta(terms, featnames(x))
106112
feat <- union(names(theta), names(seed))
@@ -123,9 +129,6 @@ textmodel_lss.dfm <- function(x, seeds, terms = NULL, k = 300, slice = NULL,
123129
slice <- seq_len(slice)
124130

125131
simil <- get_simil(embed, names(seed), names(theta), slice, simil_method)
126-
if (auto_weight)
127-
seed <- optimize_weight(seed, simil, verbose)
128-
129132
beta <- get_beta(simil, seed) * theta
130133

131134
result <- build_lss(
@@ -169,7 +172,7 @@ textmodel_lss.fcm <- function(x, seeds, terms = NULL, k = 50,
169172
weight = "count", cache = FALSE,
170173
simil_method = "cosine",
171174
engine = "rsparse",
172-
auto_weight = FALSE,
175+
nested_weight = TRUE,
173176
verbose = FALSE, ...) {
174177

175178
args <- list(terms = terms, seeds = seeds, ...)
@@ -186,7 +189,7 @@ textmodel_lss.fcm <- function(x, seeds, terms = NULL, k = 50,
186189
k <- args$w
187190
}
188191

189-
seeds <- expand_seeds(seeds, featnames(x), verbose)
192+
seeds <- expand_seeds(seeds, featnames(x), nested_weight, verbose)
190193
seed <- unlist(unname(seeds))
191194
term <- expand_terms(terms, featnames(x))
192195
feat <- union(term, names(seed))
@@ -201,9 +204,6 @@ textmodel_lss.fcm <- function(x, seeds, terms = NULL, k = 50,
201204
}
202205

203206
simil <- get_simil(embed, names(seed), term, seq_len(k), simil_method)
204-
if (auto_weight)
205-
seed <- optimize_weight(seed, simil, verbose)
206-
207207
beta <- get_beta(simil, seed)
208208

209209
result <- build_lss(
@@ -262,18 +262,14 @@ expand_terms <- function(terms, features) {
262262
return(result)
263263
}
264264

265-
expand_seeds <- function(seeds, features, verbose = FALSE) {
265+
expand_seeds <- function(seeds, features, nested_weight = TRUE, verbose = FALSE) {
266266

267267
seeds <- get_seeds(seeds)
268-
seeds_weighted <- weight_seeds(seeds, features)
268+
seeds_weighted <- weight_seeds(seeds, features, nested_weight)
269269

270270
if (all(lengths(seeds_weighted) == 0))
271271
stop("No seed word is found in the dfm", call. = FALSE)
272272

273-
if (verbose)
274-
cat(sprintf("Calculating term-term similarity to %d seed words...\n",
275-
sum(lengths(seeds_weighted))))
276-
277273
return(seeds_weighted)
278274
}
279275

@@ -431,33 +427,31 @@ coefficients.textmodel_lss <- function(object, ...) {
431427
#' Internal function to generate equally-weighted seed set
432428
#'
433429
#' @keywords internal
434-
weight_seeds <- function(seeds, type) {
430+
weight_seeds <- function(seeds, type, nested_weight = TRUE) {
435431
seeds_fix <- lapply(names(seeds), function(x) {
436432
s <- unlist(quanteda::pattern2fixed(x, type, "glob", FALSE))
437433
if (is.null(s))
438434
return(character())
439435
return(s)
440436
})
441-
weight <- 1 / table(seeds > 0)
437+
if (nested_weight) {
438+
weight <- 1 / xtabs(~ seeds > 0)
439+
} else {
440+
weight <- 1 / xtabs(lengths(seeds_fix) ~ seeds > 0)
441+
}
442442
mapply(function(x, y) {
443443
if (!length(y))
444444
return(numeric())
445-
v <- unname(x * weight[as.character(x > 0)]) / length(y)
445+
if (nested_weight) {
446+
v <- unname(x * weight[as.character(x > 0)]) / length(y)
447+
} else {
448+
v <- unname(x * weight[as.character(x > 0)])
449+
}
446450
v <- rep(v, length(y))
447451
names(v) <- y
448452
return(v)
449453
}, seeds, seeds_fix, SIMPLIFY = FALSE)
450454
}
451455

452-
# automatically align polarity score with original weight
453-
optimize_weight <- function(seed, simil, verbose) {
454-
.Deprecated(old = "auto_weight")
455-
if (verbose)
456-
cat("Optimizing seed weights...\n")
457-
result <- optim(seed, function(x) {
458-
sum((rowSums(simil$seeds %*% x) - seed) ^ 2)
459-
})
460-
return(result$par)
461-
}
462456

463457

R/textmodel_lss2.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ textmodel_lss.tokens <- function(x, seeds, terms = NULL, k = 200,
77
min_count = 5,
88
engine = "wordvector",
99
tolower = TRUE,
10+
nested_weight = TRUE,
1011
include_data = FALSE,
1112
group_data = FALSE,
1213
spatial = TRUE,
@@ -25,7 +26,7 @@ textmodel_lss.tokens <- function(x, seeds, terms = NULL, k = 200,
2526
type = "skip-gram", tolower = tolower,
2627
normalize = FALSE, verbose = verbose, ...)
2728
result <- as.textmodel_lss(w2v, seeds = seeds, terms = terms, spatial = spatial,
28-
verbose = FALSE)
29+
nested_weight = nested_weight, verbose = FALSE)
2930
result$type <- "word2vec"
3031
result$call <- try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE)
3132

man/as.textmodel_lss.Rd

Lines changed: 12 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/textmodel_lss.Rd

Lines changed: 11 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/weight_seeds.Rd

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-as.textmodel.R

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -159,28 +159,6 @@ test_that("as.textmodel_lss errors with vector", {
159159
"x must not have NA")
160160
})
161161

162-
test_that("auto_weight is working", {
163-
skip_on_cran()
164-
165-
lss1 <- as.textmodel_lss(mat_test, seed)
166-
suppressWarnings({
167-
lss2 <- as.textmodel_lss(mat_test, seed, auto_weight = TRUE)
168-
})
169-
expect_true(
170-
all(lss1$seeds_weighted != lss2$seeds_weighted)
171-
)
172-
expect_true(
173-
all(sign(lss1$seeds_weighted) == sign(lss2$seeds_weighted))
174-
)
175-
expect_true(
176-
all(abs(lss2$beta[names(lss2$seeds_weighted)] - lss1$seeds_weighted) < 0.05)
177-
)
178-
expect_warning(
179-
as.textmodel_lss(mat_test, seed, auto_weight = TRUE, verbose = FALSE),
180-
"'auto_weight' is deprecated"
181-
)
182-
})
183-
184162
test_that("terms is working", {
185163
skip_on_cran()
186164

0 commit comments

Comments
 (0)