agoldst
diff --git a/‎R/align.R‎
Lines changed: 87 additions & 4 deletions b/‎R/align.R‎
Lines changed: 87 additions & 4 deletions
diff --git a/‎R/browser_export.R‎
Lines changed: 48 additions & 9 deletions b/‎R/browser_export.R‎
Lines changed: 48 additions & 9 deletions
diff --git a/‎man/dfr_browser.Rd‎
Lines changed: 10 additions & 2 deletions b/‎man/dfr_browser.Rd‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎man/export_browser_data.Rd‎
Lines changed: 5 additions & 1 deletion b/‎man/export_browser_data.Rd‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎man/export_browser_doc_topics.Rd‎
Lines changed: 5 additions & 1 deletion b/‎man/export_browser_doc_topics.Rd‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎man/export_browser_topic_scaled.Rd‎
Lines changed: 4 additions & 1 deletion b/‎man/export_browser_topic_scaled.Rd‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎man/export_browser_topic_words.Rd‎
Lines changed: 5 additions & 1 deletion b/‎man/export_browser_topic_words.Rd‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎man/widths.Rd‎
Lines changed: 2 additions & 2 deletions b/‎man/widths.Rd‎
Lines changed: 2 additions & 2 deletions
@@ -203,10 +203,12 @@ derive one from a list of models.")
     }
 
     cl <- naive_cluster(dst_flat, K, threshold)
+
     structure(
         list(
-            # naive_cluster numbers clusters from 0
-             clusters=lapply(cl$clusters, `+`, 1),
+            # relabel clusters as sequential numbers from 1
+             clusters=lapply(cl$clusters, match,
+                 sort(unique(unlist(cl$clusters)))),
              distances=cl$distances,
              model_distances=dst,
              threshold=threshold
@@ -262,8 +264,8 @@ alignment_frame <- function (clusters) {
 #'
 #' @param x result from \code{\link{align_topics}}
 #'
-#' @return a vector whose \code{i}th element is the width of cluster \code{i}. 
-#' If there is no cluster with that number, the corresponding element is 
+#' @return a vector whose \code{i}th element is the width of cluster \code{i}.
+#' If there is no cluster with that number, the corresponding element is
 #' \code{NA}. Single-member clusters have a width of zero.
 #'
 #' @seealso \code{\link{align_topics}}, \code{\link{alignment_frame}}
@@ -278,3 +280,84 @@ widths.topic_alignment <- function (x) {
     )
 }
 
+# The naivest cluster algorithm
+#
+# For testing purposes, this function implements the single-linkage clustering
+# algorithm described in
+# \href{https://en.wikipedia.org/wiki/Single-linkage_clustering}{Wikipedia},
+# with the added constraint (see allowable() below) that two clusters are
+# merged only if they have no model in common.
+# It should yield the same clustering as \code{\link{align_topics}} (for the
+# sketch of a proof, see comments on the source code in \code{cluster.cpp}).
+#
+# dst: topic distances between models. The number of topics in each model is
+#   read off the dimensions of the matrices in dst$d[[1]]; dst[m1, m2] is
+#   presumed to yield the distances from the topics of model m1 (rows) to the
+#   topics of model m2 (columns) -- TODO confirm against the definition of
+#   the distances object.
+# threshold: merging stops when no allowable pair of clusters is at a
+#   distance less than or equal to this value.
+# verbose: if TRUE, emit a message describing each merge.
+#
+# Returns a list with one vector per model: element k of vector m is the
+# number of the cluster to which topic k of model m has been assigned.
+#
+naivest_cluster <- function (dst, threshold=Inf, verbose=FALSE) {
+    K <- c(nrow(dst$d[[1]][[1]]),
+           vapply(dst$d[[1]], ncol, integer(1)))
+    M <- length(K)
+    # model membership indicator for topic sequence
+    ms <- rep(seq_along(K), times=K)
+    # topic indicator (number of each topic within its own model)
+    ks <- unlist(lapply(K, seq_len))
+
+    # construct upper-tri distance matrix D (probably faster ways to do this)
+    # entries for pairs of topics from the same model stay NA
+    D <- matrix(NA, nrow=sum(K), ncol=sum(K))
+    cumK <- c(0, cumsum(K))
+    for (m1 in seq_len(M - 1)) {
+        for (m2 in (m1 + 1):M) {
+            D[(1 + cumK[m1]):cumK[m1 + 1],
+              (1 + cumK[m2]):cumK[m2 + 1]] <- dst[m1, m2]
+        }
+    }
+    # copy to lower-tri
+    D[lower.tri(D)] <- t(D)[lower.tri(D)]
+
+    # initial singleton clusters: cl[[i]] holds the flat topic indices
+    # belonging to the cluster currently at row/column i of D
+    cl <- as.list(seq_len(sum(K)))
+
+    # two clusters may be merged only if no model has a topic in both
+    allowable <- function (ds)
+        length(intersect(ms[cl[[ds[1]]]], ms[cl[[ds[2]]]])) == 0
+
+    if (verbose) {
+        fmt <- function (i) paste(ms[cl[[i]]] - 1, ks[cl[[i]]] - 1,
+                                  sep=":", collapse=" ")
+        # emit logging information in form comparable to naive_cluster
+        blurt <- function (cl1, cl2, d)
+            message(fmt(cl1), " | ", fmt(cl2), " [", signif(d, 4), "] ",
+                    cl1 - 1, "/", cl2 - 1)
+    } else
+        blurt <- function (...) { }
+
+    done <- FALSE
+    while (!done) {
+        done <- TRUE
+        # scan cluster pairs from nearest to farthest for the first one
+        # that may be merged; order() puts NA entries last
+        for (i in order(D)) {
+            if (D[i] > threshold || is.na(D[i]))
+                break
+            ds <- arrayInd(i, dim(D))
+            if (allowable(ds)) {
+                done <- FALSE
+                break
+            }
+        }
+        if (!done) {
+            ds <- sort(ds) # ensure ds[1] is the smaller index
+            blurt(ds[1], ds[2], D[i])
+            # merge
+            cl[[ds[1]]] <- c(cl[[ds[1]]], cl[[ds[2]]])
+            cl[[ds[2]]] <- NULL
+            # single linkage: the merged cluster's distance to any other
+            # cluster is the smaller of its two parts' distances. The column
+            # must be folded in as well as the row: otherwise D is no longer
+            # symmetric and distances recorded only in the column of the
+            # absorbed cluster vanish when that column is deleted
+            D[ds[1], ] <- pmin(D[ds[1], ], D[ds[2], ])
+            D[, ds[1]] <- pmin(D[, ds[1]], D[, ds[2]])
+            # drop=FALSE keeps D a matrix even when one cluster remains
+            D <- D[-ds[2], -ds[2], drop=FALSE]
+
+            # TODO heights
+        }
+    }
+
+    # unravel cl: label every topic with the index of its final cluster
+    result_flat <- numeric(sum(K))
+    for (i in seq_along(cl)) {
+        result_flat[cl[[i]]] <- i
+    }
+
+    # split the flat labels back up by model
+    lapply(seq_len(M), function (m)
+        result_flat[(1 + cumK[m]):cumK[m + 1]]
+    )
+
+}
+
+
@@ -152,6 +152,9 @@ write_dfb_file <- function (txt, f, zip=TRUE,
 #' decimal place, yielding a somewhat sparser doc-topics matrix (the topic-word
 #' matrix is more aggressively truncated anyway). Set to NULL for no rounding.
 #' Rounded weights are renormalized within dfr-browser itself.
+#' @param permute if non-NULL, specifies a renumbering of the topics: the new
+#' topic \code{k} is old topic \code{permute[k]}. (If what you have is the
+#' inverse mapping, giving the new number of each old topic, apply
+#' \code{\link{order}} to it to obtain the \code{permute} expected here.)
 #'
 #' @examples
 #'
@@ -185,7 +188,8 @@ export_browser_data <- function (m, out_dir, zipped=TRUE,
                                  internalize=FALSE,
                                  info=NULL,
                                  proper=FALSE,
-                                 digits=getOption("digits")) {
+                                 digits=getOption("digits"),
+                                 permute=NULL) {
     if (!requireNamespace("jsonlite", quietly=TRUE)) {
         stop("jsonlite package required for browser export. Install from CRAN.")
     }
@@ -237,6 +241,12 @@ Set overwrite=TRUE to overwrite existing files."
         }
     }
 
+    # validate permute
+    if (!is.null(permute) && !identical(sort(permute), 1:n_topics(m))) {
+        warning("ignoring invalid permute parameter")
+        permute <- NULL
+    }
+
     if (proper) {
         keys <- top_words(m, n_top_words, tw_smooth_normalize(m))
         if (!is.null(keys) && is.numeric(digits)) {
@@ -252,7 +262,8 @@ Set overwrite=TRUE to overwrite existing files."
             alpha=hyperparameters(m)$alpha,
             digits=digits,  # irrelevant unless proper is TRUE
             overwrite= overwrite || internalize,
-            index=index
+            index=index,
+            permute
         )
     } else {
         warning("Topic top words unavailable; unable to write tw.json")
@@ -276,7 +287,8 @@ Set overwrite=TRUE to overwrite existing files."
             dtm=dtm, 
             digits=digits,  # irrelevant unless proper is TRUE
             zip=zipped,
-            overwrite=overwrite || internalize, index=index
+            overwrite=overwrite || internalize, index=index,
+            permute
         ) 
     } else {
         warning("Document topics unavailable; unable to write dt.json.zip")
@@ -318,7 +330,8 @@ display may not work as expected. See ?export_browser_data for details."
             file=paste0(file.path(out_dir, "topic_scaled"), ".csv"),
             scaled=topic_scaled_2d(m, n_scaled_words),
             overwrite=overwrite || internalize,
-            index=index
+            index=index,
+            permute
         )
     } else {
         warning(
@@ -362,13 +375,20 @@ display may not work as expected. See ?export_browser_data for details."
 #' @param overwrite clobber existing file?
 #' @param index if non-NULL, output is assumed to go into an element with ID
 #' \code{m__DATA__tw} in an HTML file at this path. \code{file} is ignored.
+#' @param permute if non-NULL, exported topic \code{k} will correspond to the
+#' topic numbered \code{permute[k]} in the data
 #'
 #' @seealso \code{\link{export_browser_data}} for a more automated export of
 #' all model information at once
 #' @export
 #' 
 export_browser_topic_words <- function (file, keys, alpha, digits=4,
-                                        overwrite, index) {
+                                        overwrite, index,
+                                        permute) {
+    if (!is.null(permute)) {
+        keys$topic <- match(keys$topic, permute)
+        alpha <- alpha[permute]
+    }
     keys <- dplyr::arrange_(keys, ~ topic, ~ desc(weight))
     n_top_words <- nrow(keys) / length(alpha)
     if (!is.null(index)) {
@@ -403,14 +423,19 @@ export_browser_topic_words <- function (file, keys, alpha, digits=4,
 #' @param overwrite clobber existing file?
 #' @param index if non-NULL, output is assumed to go into an element with ID
 #' \code{m__DATA__dt} in an HTML file at this path. \code{file} is ignored.
+#' @param permute if non-NULL, exported topic \code{k} will correspond to the
+#' topic numbered \code{permute[k]} in the data
 #'
 #' @seealso \code{\link{export_browser_data}} for a more automated export of
 #' all model information at once
 #' @export
 #'
 export_browser_doc_topics <- function (file, dtm, digits=4,
-                                       zipped, overwrite, index) { 
+                                       zipped, overwrite, index, permute) { 
     dtm <- as(dtm, "CsparseMatrix")
+    if (!is.null(permute)) {
+        dtm <- dtm[ , permute]
+    }
     if (!is.null(index)) {
         file <- "dt.json"
     }
@@ -467,15 +492,21 @@ export_browser_metadata <- function (file, meta, zipped, overwrite, index) {
 #' @param index if non-NULL, output is assumed to go into an element with ID
 #' \code{m__DATA__topic_scaled} in an HTML file at this path. \code{file} is
 #' ignored.
+#' @param permute if non-NULL, exported topic \code{k} will correspond to the
+#' topic numbered \code{permute[k]} in the data
 #'
 #' @seealso \code{\link{export_browser_data}} for a more automated export of
 #' all model information at once
 #' @export
 #'
-export_browser_topic_scaled <- function (file, scaled, overwrite, index) {
+export_browser_topic_scaled <- function (file, scaled, overwrite, index,
+                                         permute) {
     if (!is.null(index)) {
         file <- "topic_scaled.csv"
     }
+    if (!is.null(permute)) {
+        scaled <- scaled[permute]
+    }
     write_dfb_file(capture.output(
         write.table(scaled, quote=FALSE, sep=",", row.names=FALSE,
                     col.names=FALSE)
@@ -547,8 +578,8 @@ export_browser_info <- function (file, info, overwrite, index) {
 #' @param internalize if TRUE, model data is in the browser home page rather
 #'   than separate files. See Details.
 #' @param ... passed on to \code{\link{export_browser_data}}, q.v., especially
-#' the parameters \code{overwrite}, \code{n_scaled_words}, \code{info}, and
-#' \code{proper}
+#' the parameters \code{overwrite}, \code{n_scaled_words}, \code{info}, 
+#' \code{proper}, and \code{permute}
 #'
 #' @seealso \code{\link{export_browser_data}} which does the work of exporting
 #'   files, \code{\link{model_dfr_documents}}, \code{\link{train_model}},
@@ -561,6 +592,14 @@ export_browser_info <- function (file, info, overwrite, index) {
 #'     "stoplist.txt", n_topics=40)
 #' # launch browser
 #' dfr_browser(m)
+#'
+#' # generate a second model and align its topics with the first for more
+#' # convenient comparisons
+#' m2 <- model_dfr_documents("citations.CSV", "wordcounts",
+#'     "stoplist.txt", n_topics=40)
+#' cl <- model_distances(list(m, m2), n_words=40) %>% align_topics()
+#' dfr_browser(m2, permute=match(cl$clusters[[1]], cl$clusters[[2]]))
+#' 
 #' }
 #'
 #' @export