Skip to content

Commit 82784fa

Browse files
committed
Merge branch 'permute'
2 parents 2235503 + 2d07972 commit 82784fa

File tree

11 files changed

+224
-23
lines changed

11 files changed

+224
-23
lines changed

R/align.R

Lines changed: 87 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -203,10 +203,12 @@ derive one from a list of models.")
203203
}
204204

205205
cl <- naive_cluster(dst_flat, K, threshold)
206+
206207
structure(
207208
list(
208-
# naive_cluster numbers clusters from 0
209-
clusters=lapply(cl$clusters, `+`, 1),
209+
# relabel clusters as sequential numbers from 1
210+
clusters=lapply(cl$clusters, match,
211+
sort(unique(unlist(cl$clusters)))),
210212
distances=cl$distances,
211213
model_distances=dst,
212214
threshold=threshold
@@ -262,8 +264,8 @@ alignment_frame <- function (clusters) {
262264
#'
263265
#' @param x result from \code{\link{align_topics}}
264266
#'
265-
#' @return a vector whose \code{i}th element is the width of cluster \code{i}.
266-
#' If there is no cluster with that number, the corresponding element is
267+
#' @return a vector whose \code{i}th element is the width of cluster \code{i}.
268+
#' If there is no cluster with that number, the corresponding element is
267269
#' \code{NA}. Single-member clusters have a width of zero.
268270
#'
269271
#' @seealso \code{\link{align_topics}}, \code{\link{alignment_frame}}
@@ -278,3 +280,84 @@ widths.topic_alignment <- function (x) {
278280
)
279281
}
280282

283+
# The naivest cluster algorithm
284+
#
285+
# For testing purposes, this function implements the single-linkage clustering
286+
# algorithm described in
287+
# \href{https://en.wikipedia.org/wiki/Single-linkage_clustering}{Wikipedia}.
288+
# It should yield the same clustering as \code{\link{align_topics}} (for the
289+
# sketch of a proof, see comments on the source code in \code{cluster.cpp}).
290+
#
291+
naivest_cluster <- function (dst, threshold=Inf, verbose=FALSE) {
292+
K <- c(nrow(dst$d[[1]][[1]]),
293+
vapply(dst$d[[1]], ncol, integer(1)))
294+
M <- length(K)
295+
# model membership indicator for topic sequence
296+
ms <- rep(seq_along(K), times=K)
297+
# topic indicator
298+
ks <- do.call(c, lapply(K, seq))
299+
300+
# construct upper-tri distance matrix D (probably faster ways to do this)
301+
D <- matrix(NA, nrow=sum(K), ncol=sum(K))
302+
cumK <- c(0, cumsum(K))
303+
for (m1 in 1:(M - 1))
304+
for (m2 in (m1 + 1):M)
305+
D[(1 + cumK[m1]):cumK[m1 + 1],
306+
(1 + cumK[m2]):cumK[m2 + 1]] <- dst[m1, m2]
307+
# copy to lower-tri
308+
D[lower.tri(D)] <- t(D)[lower.tri(D)]
309+
310+
# initial singleton clusters
311+
cl <- as.list(seq(sum(K)))
312+
313+
allowable <- function (ds)
314+
length(intersect(ms[cl[[ds[1]]]], ms[cl[[ds[2]]]])) == 0
315+
316+
if (verbose) {
317+
fmt <- function (i) paste(ms[cl[[i]]] - 1, ks[cl[[i]]] - 1,
318+
sep=":", collapse=" ")
319+
# emit logging information in form comparable to naive_cluster
320+
blurt <- function (cl1, cl2, d)
321+
message(fmt(cl1), " | ", fmt(cl2), " [", signif(d, 4), "] ",
322+
cl1 - 1, "/", cl2 - 1)
323+
} else
324+
blurt <- function (...) { }
325+
326+
done <- F
327+
while (!done) {
328+
done <- T
329+
for (i in order(D)) {
330+
if (D[i] > threshold || is.na(D[i]))
331+
break
332+
ds <- arrayInd(i, dim(D))
333+
if (allowable(ds)) {
334+
done <- F
335+
break
336+
}
337+
}
338+
if (!done) {
339+
ds <- sort(ds) # ensure ds[1] is the smaller index
340+
blurt(ds[1], ds[2], D[i])
341+
# merge
342+
cl[[ds[1]]] <- c(cl[[ds[1]]], cl[[ds[2]]])
343+
cl[[ds[2]]] <- NULL
344+
D[ds[1], ] <- pmin(D[ds[1], ], D[ds[2], ])
345+
D <- D[-ds[2], -ds[2]]
346+
347+
# TODO heights
348+
}
349+
}
350+
351+
# unravel cl
352+
result_flat <- numeric(sum(K))
353+
for (i in seq_along(cl)) {
354+
result_flat[cl[[i]]] <- i
355+
}
356+
357+
lapply(seq(M), function (m)
358+
result_flat[(1 + cumK[m]):cumK[m + 1]]
359+
)
360+
361+
}
362+
363+

R/browser_export.R

Lines changed: 48 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,9 @@ write_dfb_file <- function (txt, f, zip=TRUE,
152152
#' decimal place, yielding a somewhat sparser doc-topics matrix (the topic-word
153153
#' matrix is more aggressively truncated anyway). Set to NULL for no rounding.
154154
#' Rounded weights are renormalized within dfr-browser itself.
155+
#' @param permute if non-NULL, specifies a renumbering of the topics: the new
156+
#' topic \code{k} is old topic \code{permute[k]}. (If you have the inverse, use
157+
#' \code{\link{order}(permute)} to invert it back.)
155158
#'
156159
#' @examples
157160
#'
@@ -185,7 +188,8 @@ export_browser_data <- function (m, out_dir, zipped=TRUE,
185188
internalize=FALSE,
186189
info=NULL,
187190
proper=FALSE,
188-
digits=getOption("digits")) {
191+
digits=getOption("digits"),
192+
permute=NULL) {
189193
if (!requireNamespace("jsonlite", quietly=TRUE)) {
190194
stop("jsonlite package required for browser export. Install from CRAN.")
191195
}
@@ -237,6 +241,12 @@ Set overwrite=TRUE to overwrite existing files."
237241
}
238242
}
239243

244+
# validate permute
245+
if (!is.null(permute) && !identical(sort(permute), 1:n_topics(m))) {
246+
warning("ignoring invalid permute parameter")
247+
permute <- NULL
248+
}
249+
240250
if (proper) {
241251
keys <- top_words(m, n_top_words, tw_smooth_normalize(m))
242252
if (!is.null(keys) && is.numeric(digits)) {
@@ -252,7 +262,8 @@ Set overwrite=TRUE to overwrite existing files."
252262
alpha=hyperparameters(m)$alpha,
253263
digits=digits, # irrelevant unless proper is TRUE
254264
overwrite= overwrite || internalize,
255-
index=index
265+
index=index,
266+
permute
256267
)
257268
} else {
258269
warning("Topic top words unavailable; unable to write tw.json")
@@ -276,7 +287,8 @@ Set overwrite=TRUE to overwrite existing files."
276287
dtm=dtm,
277288
digits=digits, # irrelevant unless proper is TRUE
278289
zip=zipped,
279-
overwrite=overwrite || internalize, index=index
290+
overwrite=overwrite || internalize, index=index,
291+
permute
280292
)
281293
} else {
282294
warning("Document topics unavailable; unable to write dt.json.zip")
@@ -318,7 +330,8 @@ display may not work as expected. See ?export_browser_data for details."
318330
file=paste0(file.path(out_dir, "topic_scaled"), ".csv"),
319331
scaled=topic_scaled_2d(m, n_scaled_words),
320332
overwrite=overwrite || internalize,
321-
index=index
333+
index=index,
334+
permute
322335
)
323336
} else {
324337
warning(
@@ -362,13 +375,20 @@ display may not work as expected. See ?export_browser_data for details."
362375
#' @param overwrite clobber existing file?
363376
#' @param index if non-NULL, output is assumed to go into an element with ID
364377
#' \code{m__DATA__tw} in an HTML file at this path. \code{file} is ignored.
378+
#' @param permute if non-NULL, exported topic \code{k} will correspond to the
379+
#' topic numbered \code{permute[k]} in the data
365380
#'
366381
#' @seealso \code{\link{export_browser_data}} for a more automated export of
367382
#' all model information at once
368383
#' @export
369384
#'
370385
export_browser_topic_words <- function (file, keys, alpha, digits=4,
371-
overwrite, index) {
386+
overwrite, index,
387+
permute) {
388+
if (!is.null(permute)) {
389+
keys$topic <- match(keys$topic, permute)
390+
alpha <- alpha[permute]
391+
}
372392
keys <- dplyr::arrange_(keys, ~ topic, ~ desc(weight))
373393
n_top_words <- nrow(keys) / length(alpha)
374394
if (!is.null(index)) {
@@ -403,14 +423,19 @@ export_browser_topic_words <- function (file, keys, alpha, digits=4,
403423
#' @param overwrite clobber existing file?
404424
#' @param index if non-NULL, output is assumed to go into an element with ID
405425
#' \code{m__DATA__dt} in an HTML file at this path. \code{file} is ignored.
426+
#' @param permute if non-NULL, exported topic \code{k} will correspond to the
427+
#' topic numbered \code{permute[k]} in the data
406428
#'
407429
#' @seealso \code{\link{export_browser_data}} for a more automated export of
408430
#' all model information at once
409431
#' @export
410432
#'
411433
export_browser_doc_topics <- function (file, dtm, digits=4,
412-
zipped, overwrite, index) {
434+
zipped, overwrite, index, permute) {
413435
dtm <- as(dtm, "CsparseMatrix")
436+
if (!is.null(permute)) {
437+
dtm <- dtm[ , permute]
438+
}
414439
if (!is.null(index)) {
415440
file <- "dt.json"
416441
}
@@ -467,15 +492,21 @@ export_browser_metadata <- function (file, meta, zipped, overwrite, index) {
467492
#' @param index if non-NULL, output is assumed to go into an element with ID
468493
#' \code{m__DATA__topic_scaled} in an HTML file at this path. \code{file} is
469494
#' ignored.
495+
#' @param permute if non-NULL, exported topic \code{k} will correspond to the
496+
#' topic numbered \code{permute[k]} in the data
470497
#'
471498
#' @seealso \code{\link{export_browser_data}} for a more automated export of
472499
#' all model information at once
473500
#' @export
474501
#'
475-
export_browser_topic_scaled <- function (file, scaled, overwrite, index) {
502+
export_browser_topic_scaled <- function (file, scaled, overwrite, index,
503+
permute) {
476504
if (!is.null(index)) {
477505
file <- "topic_scaled.csv"
478506
}
507+
if (!is.null(permute)) {
508+
scaled <- scaled[permute]
509+
}
479510
write_dfb_file(capture.output(
480511
write.table(scaled, quote=FALSE, sep=",", row.names=FALSE,
481512
col.names=FALSE)
@@ -547,8 +578,8 @@ export_browser_info <- function (file, info, overwrite, index) {
547578
#' @param internalize if TRUE, model data is in the browser home page rather
548579
#' than separate files. See Details.
549580
#' @param ... passed on to \code{\link{export_browser_data}}, q.v., especially
550-
#' the parameters \code{overwrite}, \code{n_scaled_words}, \code{info}, and
551-
#' \code{proper}
581+
#' the parameters \code{overwrite}, \code{n_scaled_words}, \code{info},
582+
#' \code{proper}, and \code{permute}
552583
#'
553584
#' @seealso \code{\link{export_browser_data}} which does the work of exporting
554585
#' files, \code{\link{model_dfr_documents}}, \code{\link{train_model}},
@@ -561,6 +592,14 @@ export_browser_info <- function (file, info, overwrite, index) {
561592
#' "stoplist.txt", n_topics=40)
562593
#' # launch browser
563594
#' dfr_browser(m)
595+
#'
596+
#' # generate a second model and align its topics with the first for more
597+
#' # convenient comparisons
598+
#' m2 <- model_dfr_documents("citations.CSV", "wordcounts",
599+
#' "stoplist.txt", n_topics=40)
600+
#' cl <- model_distances(list(m, m2), n_words=40) %>% align_topics()
601+
#' dfr_browser(m2, permute=match(cl$clusters[[1]], cl$clusters[[2]])))
602+
#'
564603
#' }
565604
#'
566605
#' @export

man/dfr_browser.Rd

Lines changed: 10 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/export_browser_data.Rd

Lines changed: 5 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/export_browser_doc_topics.Rd

Lines changed: 5 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/export_browser_topic_scaled.Rd

Lines changed: 4 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/export_browser_topic_words.Rd

Lines changed: 5 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/widths.Rd

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)