RBigData
diff --git a/‎ChangeLog‎
Lines changed: 3 additions & 0 deletions b/‎ChangeLog‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎DESCRIPTION‎
Lines changed: 5 additions & 5 deletions b/‎DESCRIPTION‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 1 addition & 0 deletions b/‎NAMESPACE‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎R/00_pmclust_reduceK.r‎
Lines changed: 100 additions & 0 deletions b/‎R/00_pmclust_reduceK.r‎
Lines changed: 100 additions & 0 deletions
diff --git a/‎R/00_pmclust_reduceK_dmat.r‎
Lines changed: 81 additions & 0 deletions b/‎R/00_pmclust_reduceK_dmat.r‎
Lines changed: 81 additions & 0 deletions
diff --git a/‎R/dmat_em_base.r‎
Lines changed: 26 additions & 8 deletions b/‎R/dmat_em_base.r‎
Lines changed: 26 additions & 8 deletions
diff --git a/‎R/pm_aecm_base.r‎
Lines changed: 25 additions & 8 deletions b/‎R/pm_aecm_base.r‎
Lines changed: 25 additions & 8 deletions
diff --git a/‎R/pm_apecm_base.r‎
Lines changed: 5 additions & 1 deletion b/‎R/pm_apecm_base.r‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎R/pm_apecma_base.r‎
Lines changed: 25 additions & 8 deletions b/‎R/pm_apecma_base.r‎
Lines changed: 25 additions & 8 deletions
@@ -1,3 +1,6 @@
+2018-02-01: Ver. 0.2-0
+  * Add ‘R_registerRoutines’, ‘R_useDynamicSymbols’.
+
 2016-12-17: Ver. 0.1-8
   * Change web address.
 
 
@@ -1,6 +1,6 @@
 Package: pmclust
-Version: 0.1-9
-Date: 2016-12-17
+Version: 0.2-0
+Date: 2018-02-01
 Title: Parallel Model-Based Clustering using
         Expectation-Gathering-Maximization Algorithm for Finite Mixture
         Gaussian Model
@@ -21,9 +21,9 @@ Description: Aims to utilize model-based clustering (unsupervised)
         Gaussian models. The implementation is default in the single program
         multiple data programming model. The code can be executed
         through pbdMPI and independent to most MPI applications.
-        See the High Performance
-        Statistical Computing website for more information, documents
-        and examples.
+        See the High Performance Statistical Computing website
+        <https://snoweye.github.io/hpsc/>
+        for more information, documents and examples.
 License: GPL (>= 2)
 URL: http://r-pbd.org/
 BugReports: http://group.r-pbd.org/
 
@@ -26,6 +26,7 @@ importFrom(pbdMPI,get.jid)
 export(
 ### General functions.
   "pmclust",
+  "pmclust.reduceK",
   "pkmeans",
   "as.dmat",
   "as.spmd",
 
@@ -0,0 +1,100 @@
+### For automatically reducing K methods.
+
+### Entry point: fit a model while automatically reducing K on failure.
+###
+### The data X are not passed in; as with pmclust(), they are expected to
+### be available already (spmd, gbd, or dmat, e.g. set at .pmclustEnv).
+###
+### Args:
+###   K:         initial number of clusters.
+###   algorithm: character; only the first element is used to dispatch.
+### Returns:
+###   Whatever the dispatched pmclust.reduceK.spmd() or
+###   pmclust.reduceK.dmat() returns.
+pmclust.reduceK <- function(K = 2, algorithm = .PMC.CT$algorithm){
+  algo <- algorithm[1]
+
+  # K-means variants have no reduceK implementation.
+  unsupported <- c("kmeans", "kmeans.dmat")
+  if(any(algo %in% unsupported)){
+    stop("kmeans/pkmeans is not supported in reduceK.")
+  }
+
+  # Dispatch on the storage format implied by the algorithm name.
+  if(algo %in% .PMC.CT$algorithm.gbd){
+    return(pmclust.reduceK.spmd(K = K, algorithm = algorithm))
+  }
+  if(algo %in% .PMC.CT$algorithm.dmat){
+    return(pmclust.reduceK.dmat(K = K, algorithm = algorithm))
+  }
+
+  comm.stop("The algorithm is not found.")
+} # End of pmclust.reduceK().
+
+
+### SPMD/GBD version of the automatic K reduction.
+###
+### Initializes K clusters, runs the selected update step, and whenever the
+### step fails (a "try-error" is returned, or the step functions flagged
+### .pmclustEnv$CHECK$convergence as 99) drops one or more components,
+### re-seeds the parameters from the last valid ones, and refits. Stops as
+### soon as a fit succeeds or K reaches 1.
+###
+### Args:
+###   K:         initial number of clusters.
+###   algorithm: character; algorithm[1] must be one of "em", "aecm",
+###              "apecm", or "apecma".
+### Returns:
+###   A list with elements `algorithm`, `param`, `class`, `n.class`,
+###   and `check`.
+pmclust.reduceK.spmd <- function(K = 2, algorithm = .PMC.CT$algorithm){
+  # Get an initial start.
+  PARAM.org <- set.global(K = K)
+  PARAM.org <- try(initial.em(PARAM.org), silent = TRUE)
+
+  # Ensure the initial is good. Warning: there is no retry limit, so this
+  # may take forever to run!
+  while(inherits(PARAM.org, "try-error")){
+    PARAM.org <- set.global(K = K)
+    PARAM.org <- try(initial.em(PARAM.org), silent = TRUE)
+  }
+
+  # Update steps.
+  method.step <- switch(algorithm[1],
+                        "em" = em.step,
+                        "aecm" = aecm.step,
+                        "apecm" = apecm.step,
+                        "apecma" = apecma.step,
+                        NULL)
+  if(comm.all(is.null(method.step))){
+    comm.stop("Algorithm is not found.")
+  }
+  PARAM.new <- try(method.step(PARAM.org), silent = TRUE)
+  em.update.class()
+  N.CLASS <- get.N.CLASS(K)
+
+
+  # Reduce K if error occurs.
+  repeat{
+    failed <- inherits(PARAM.new, "try-error")
+    if(!((failed || .pmclustEnv$CHECK$convergence == 99) && K > 1)){
+      break
+    }
+
+    # A "try-error" object carries no parameters (`$` would fail on it),
+    # so fall back to the parameters the failed step started from.
+    PARAM.prev <- if(failed) PARAM.org else PARAM.new
+
+    # Drop specific i.k if available or
+    # drop the smallest class or
+    # drop the class with the smallest eta among all small classes or
+    # drop all classes with 0 elements.
+    if(.pmclustEnv$CONTROL$stop.at.fail && .pmclustEnv$FAIL.i.k > 0){
+      i.k <- .pmclustEnv$FAIL.i.k
+    } else{
+      i.k <- which(N.CLASS == min(N.CLASS))
+    }
+    # Several equally small, non-empty classes: keep only the one with the
+    # smallest mixing proportion. Test the number of candidates, not their
+    # values; `i.k > 1` is a vector here and is not a valid `&&` operand.
+    if(length(i.k) > 1 && min(N.CLASS) > 0){
+      i.k <- i.k[which.min(PARAM.prev$ETA[i.k])]
+    }
+    K <- K - length(i.k)
+    comm.cat("- Reduce: ", K, "\n")
+
+    # Initial global storage.
+    PARAM.org <- set.global(K = K)
+
+    # Seed PARAM.org with the surviving components, renormalizing ETA so
+    # the mixing proportions sum to one again.
+    PARAM.org$ETA <- PARAM.prev$ETA[-i.k] / sum(PARAM.prev$ETA[-i.k])
+    PARAM.org$log.ETA <- log(PARAM.org$ETA)
+    # matrix(..., ncol = K) keeps MU a matrix even when K == 1.
+    PARAM.org$MU <- matrix(PARAM.prev$MU[, -i.k], ncol = K)
+    PARAM.org$SIGMA <- PARAM.prev$SIGMA[-i.k]
+
+    # Update steps.
+    e.step.spmd(PARAM.org)
+    PARAM.new <- try(method.step(PARAM.org), silent = TRUE)
+    em.update.class()
+    N.CLASS <- get.N.CLASS(K)
+  }
+
+  # For return.
+  ret <- list(algorithm = algorithm[1],
+              param = PARAM.new,
+              class = .pmclustEnv$CLASS.spmd,
+              n.class = N.CLASS,
+              check = .pmclustEnv$CHECK)
+
+  ret
+} # End of pmclust.reduceK.spmd().
+
@@ -0,0 +1,81 @@
+### A dmat version for automatically reducing K methods.
+
+### DMAT version of the automatic K reduction.
+###
+### Initializes K clusters, runs the selected update step, and whenever the
+### step fails (a "try-error" is returned, or
+### .pmclustEnv$CHECK$convergence is 99) drops one or more components,
+### re-seeds the parameters from the last valid ones, and refits. Stops as
+### soon as a fit succeeds or K reaches 1.
+###
+### Args:
+###   K:         initial number of clusters.
+###   algorithm: character; only "em.dmat" is dispatched at present (the
+###              other dmat algorithms are commented out below).
+### Returns:
+###   A list with elements `algorithm`, `param`, `class`, `n.class`,
+###   and `check`.
+pmclust.reduceK.dmat <- function(K = 2, algorithm = .PMC.CT$algorithm){
+  # Get an initial start.
+  PARAM.org <- set.global.dmat(K = K)
+  PARAM.org <- try(initial.em.dmat(PARAM.org), silent = TRUE)
+
+  # Ensure the initial is good. Warning: there is no retry limit, so this
+  # may take forever to run!
+  while(inherits(PARAM.org, "try-error")){
+    PARAM.org <- set.global.dmat(K = K)
+    PARAM.org <- try(initial.em.dmat(PARAM.org), silent = TRUE)
+  }
+
+  # Update steps.
+  method.step <- switch(algorithm[1],
+                        "em.dmat" = em.step.dmat,
+                        # "aecm.dmat" = aecm.step.dmat,
+                        # "apecm.dmat" = apecm.step.dmat,
+                        # "apecma.dmat" = apecma.step.dmat,
+                        NULL)
+  if(comm.all(is.null(method.step))){
+    comm.stop("Algorithm is not found.")
+  }
+  PARAM.new <- try(method.step(PARAM.org), silent = TRUE)
+  em.update.class.dmat()
+  N.CLASS <- get.N.CLASS.dmat(K)
+
+
+  # Reduce K if error occurs.
+  repeat{
+    failed <- inherits(PARAM.new, "try-error")
+    if(!((failed || .pmclustEnv$CHECK$convergence == 99) && K > 1)){
+      break
+    }
+
+    # A "try-error" object carries no parameters (`$` would fail on it),
+    # so fall back to the parameters the failed step started from.
+    PARAM.prev <- if(failed) PARAM.org else PARAM.new
+
+    # Drop specific i.k if available or
+    # drop the smallest class or
+    # drop the class with the smallest eta among all small classes or
+    # drop all classes with 0 elements.
+    if(.pmclustEnv$CONTROL$stop.at.fail && .pmclustEnv$FAIL.i.k > 0){
+      i.k <- .pmclustEnv$FAIL.i.k
+    } else{
+      i.k <- which(N.CLASS == min(N.CLASS))
+    }
+    # Several equally small, non-empty classes: keep only the one with the
+    # smallest mixing proportion. Test the number of candidates, not their
+    # values; `i.k > 1` is a vector here and is not a valid `&&` operand.
+    if(length(i.k) > 1 && min(N.CLASS) > 0){
+      i.k <- i.k[which.min(PARAM.prev$ETA[i.k])]
+    }
+    K <- K - length(i.k)
+    comm.cat("- Reduce: ", K, "\n")
+
+    # Initial global storage.
+    PARAM.org <- set.global.dmat(K = K)
+
+    # Seed PARAM.org with the surviving components, renormalizing ETA so
+    # the mixing proportions sum to one again.
+    PARAM.org$ETA <- PARAM.prev$ETA[-i.k] / sum(PARAM.prev$ETA[-i.k])
+    PARAM.org$log.ETA <- log(PARAM.org$ETA)
+    # matrix(..., ncol = K) keeps MU a matrix even when K == 1.
+    PARAM.org$MU <- matrix(PARAM.prev$MU[, -i.k], ncol = K)
+    PARAM.org$SIGMA <- PARAM.prev$SIGMA[-i.k]
+
+    # Update steps.
+    e.step.dmat(PARAM.org)
+    PARAM.new <- try(method.step(PARAM.org), silent = TRUE)
+    em.update.class.dmat()
+    N.CLASS <- get.N.CLASS.dmat(K)
+  }
+
+  # For return.
+  # NOTE(review): this returns CLASS.spmd from the dmat code path; confirm
+  # whether em.update.class.dmat() fills CLASS.spmd or a dmat-specific slot.
+  ret <- list(algorithm = algorithm[1],
+              param = PARAM.new,
+              class = .pmclustEnv$CLASS.spmd,
+              n.class = N.CLASS,
+              check = .pmclustEnv$CHECK)
+
+  ret
+} # End of pmclust.reduceK.dmat().
+
@@ -212,17 +212,30 @@ m.step.dmat <- function(PARAM){
       tmp.1 <- crossprod(B)
       tmp.2 <- as.matrix(tmp.1)
       tmp.SIGMA <- tmp.2 
-      dim(tmp.SIGMA) <- c(p, p)
 
-      tmp.U <- decompsigma(tmp.SIGMA)
-      PARAM$U.check[[i.k]] <- tmp.U$check
-      if(tmp.U$check){
-        PARAM$U[[i.k]] <- tmp.U$value
-        PARAM$SIGMA[[i.k]] <- tmp.SIGMA
+      if(!any(is.nan(tmp.SIGMA))){
+        dim(tmp.SIGMA) <- c(p, p)
+
+        tmp.U <- decompsigma(tmp.SIGMA)
+        PARAM$U.check[[i.k]] <- tmp.U$check
+        if(tmp.U$check){
+          PARAM$U[[i.k]] <- tmp.U$value
+          PARAM$SIGMA[[i.k]] <- tmp.SIGMA
+        }
+      } else{
+        PARAM$U.check[[i.k]] <- FALSE
+        if(.pmclustEnv$CONTROL$debug > 2){
+          comm.cat("  SIGMA[[", i.k, "]] has NaN. Updating is skipped.\n", sep = "", quiet = TRUE)
+        }
+
+        .pmclustEnv$FAIL.i.k <- i.k    # i.k is failed to update.
+        if(.pmclustEnv$CONTROL$stop.at.fail){
+          stop(paste("NaN occurs at i.k=", i.k, sep = ""))
+        }
       }
     } else{
       if(.pmclustEnv$CONTROL$debug > 2){
-        comm.cat("  SIGMA[[", i.k, "]] is fixed.\n", sep = "", quiet = TRUE)
+        comm.cat("  SIGMA[[", i.k, "]] is fixed. Updating is skipped.\n", sep = "", quiet = TRUE)
       }
     }
   }
@@ -262,11 +275,16 @@ em.step.dmat <- function(PARAM.org){
       time.start <- proc.time()
     }
 
+    ### This is used to record which i.k may be failed to update.
+    .pmclustEnv$FAIL.i.k <- 0
+
+    ### Start EM next in DMAT format.
+
     ### WCC: original
     PARAM.new <- try(em.onestep.dmat(PARAM.org))
     ### WCC: temp
     # PARAM.new <- em.onestep.dmat(PARAM.org)
-    if(comm.any(class(PARAM.new) == "try-error")){
+    if(class(PARAM.new) == "try-error" || is.nan(PARAM.new$logL)){
       comm.cat("Results of previous iterations are returned.\n", quiet = TRUE)
       .pmclustEnv$CHECK$convergence <- 99
       PARAM.new <- PARAM.org
 
@@ -40,17 +40,30 @@ cm.step.spmd.SIGMA <- function(PARAM){
            sqrt(.pmclustEnv$Z.spmd[, i.k] / .pmclustEnv$Z.colSums[i.k])
       tmp.SIGMA <- crossprod(B)
       tmp.SIGMA <- spmd.allreduce.double(tmp.SIGMA, double(p.2), op = "sum") 
-      dim(tmp.SIGMA) <- c(p, p)
 
-      tmp.U <- decompsigma(tmp.SIGMA)
-      PARAM$U.check[[i.k]] <- tmp.U$check
-      if(tmp.U$check){
-        PARAM$U[[i.k]] <- tmp.U$value
-        PARAM$SIGMA[[i.k]] <- tmp.SIGMA
+      if(!any(is.nan(tmp.SIGMA))){
+        dim(tmp.SIGMA) <- c(p, p)
+
+        tmp.U <- decompsigma(tmp.SIGMA)
+        PARAM$U.check[[i.k]] <- tmp.U$check
+        if(tmp.U$check){
+          PARAM$U[[i.k]] <- tmp.U$value
+          PARAM$SIGMA[[i.k]] <- tmp.SIGMA
+        }
+      } else{
+        PARAM$U.check[[i.k]] <- FALSE
+        if(.pmclustEnv$CONTROL$debug > 2){
+          comm.cat("  SIGMA[[", i.k, "]] has NaN. Updating is skipped\n", sep = "", quiet = TRUE)
+        }
+
+        .pmclustEnv$FAIL.i.k <- i.k    # i.k is failed to update.
+        if(.pmclustEnv$CONTROL$stop.at.fail){
+          stop(paste("NaN occurs at i.k=", i.k, sep = ""))
+        }
       }
     } else{
       if(.pmclustEnv$CONTROL$debug > 2){
-        comm.cat("  SIGMA[[", i.k, "]] is fixed.\n", sep = "")
+        comm.cat("  SIGMA[[", i.k, "]] is fixed. Updating is skipped\n", sep = "")
       }
     }
   }
@@ -84,8 +97,12 @@ aecm.step.spmd <- function(PARAM.org){
       time.start <- proc.time()
     }
 
+    ### This is used to record which i.k may be failed to update.
+    .pmclustEnv$FAIL.i.k <- 0
+
+    ### Start AECM here.
     PARAM.new <- try(aecm.onestep.spmd(PARAM.org))
-    if(comm.any(class(PARAM.new) == "try-error")){
+    if(class(PARAM.new) == "try-error" || is.nan(PARAM.new$logL)){
       comm.cat("Results of previous iterations are returned.\n", quiet = TRUE)
       .pmclustEnv$CHECK$convergence <- 99
       PARAM.new <- PARAM.org
 
@@ -133,8 +133,12 @@ apecm.step.spmd <- function(PARAM.org){
       time.start <- proc.time()
     }
 
+    ### This is used to record which i.k may be failed to update.
+    .pmclustEnv$FAIL.i.k <- 0
+
+    ### Start APECM here.
     PARAM.new <- try(apecm.onestep.spmd(PARAM.org))
-    if(comm.any(class(PARAM.new) == "try-error")){
+    if(class(PARAM.new) == "try-error" || is.nan(PARAM.new$logL)){
       comm.cat("Results of previous iterations are returned.\n", quiet = TRUE)
       .pmclustEnv$CHECK$convergence <- 99
       PARAM.new <- PARAM.org
 
@@ -37,17 +37,30 @@ cm.step.spmd.MU.SIGMA.k <- function(PARAM, i.k){
          sqrt(.pmclustEnv$Z.spmd[, i.k] / .pmclustEnv$Z.colSums[i.k])
     tmp.SIGMA <- crossprod(B)
     tmp.SIGMA <- spmd.allreduce.double(tmp.SIGMA, double(p.2), op = "sum") 
-    dim(tmp.SIGMA) <- c(p, p)
 
-    tmp.U <- decompsigma(tmp.SIGMA)
-    PARAM$U.check[[i.k]] <- tmp.U$check
-    if(tmp.U$check){
-      PARAM$U[[i.k]] <- tmp.U$value
-      PARAM$SIGMA[[i.k]] <- tmp.SIGMA
+    if(!any(is.nan(tmp.SIGMA))){
+      dim(tmp.SIGMA) <- c(p, p)
+
+      tmp.U <- decompsigma(tmp.SIGMA)
+      PARAM$U.check[[i.k]] <- tmp.U$check
+      if(tmp.U$check){
+        PARAM$U[[i.k]] <- tmp.U$value
+        PARAM$SIGMA[[i.k]] <- tmp.SIGMA
+      }
+    } else{
+      PARAM$U.check[[i.k]] <- FALSE
+      if(.pmclustEnv$CONTROL$debug > 2){
+        comm.cat("  SIGMA[[", i.k, "]] has NaN. Updating is skipped.\n", sep = "", quiet = TRUE)
+      }
+
+      .pmclustEnv$FAIL.i.k <- i.k    # i.k is failed to update.
+      if(.pmclustEnv$CONTROL$stop.at.fail){
+        stop(paste("NaN occurs at i.k=", i.k, sep = ""))
+      }
     }
   } else{
     if(.pmclustEnv$CONTROL$debug > 2){
-      comm.cat("  SIGMA[[", i.k, "]] is fixed.\n", sep = "", quiet = TRUE)
+      comm.cat("  SIGMA[[", i.k, "]] is fixed. Updating is skipped.\n", sep = "", quiet = TRUE)
     }
   }
 
@@ -79,8 +92,12 @@ apecma.step.spmd <- function(PARAM.org){
       time.start <- proc.time()
     }
 
+    ### This is used to record which i.k may be failed to update.
+    .pmclustEnv$FAIL.i.k <- 0
+
+    ### Start APECMA here.
     PARAM.new <- try(apecma.onestep.spmd(PARAM.org))
-    if(comm.any(class(PARAM.new) == "try-error")){
+    if(class(PARAM.new) == "try-error" || is.nan(PARAM.new$logL)){
       comm.cat("Results of previous iterations are returned.\n", quiet =TRUE)
       .pmclustEnv$CHECK$convergence <- 99
       PARAM.new <- PARAM.org