Bayesian-Graphical-Modelling-Lab · MaartenMarsman · Dec 9, 2024 · Dec 5, 2024 · Dec 5, 2024 · Dec 6, 2024
diff --git a/R/bgm.R b/R/bgm.R
@@ -111,12 +111,14 @@
 #' model \code{edge_prior = "Stochastic-Block"} assumes that nodes can be
 #' organized into blocks or clusters. In principle, the assignment of nodes to
 #' such clusters is unknown, and the model as implemented here considers all
-#' possible options \insertCite{@i.e., specifies a Dirichlet process on the node to block
-#' allocation as described by @GengEtAl_2019}{bgms}. This model is advantageous
+#' possible options \insertCite{@i.e., specifies a Dirichlet prior on the probability
+#' of allocations as described by @GengEtAl_2019}{bgms}. This model is advantageous
 #' when nodes are expected to fall into distinct clusters. The inclusion
 #' probabilities for the edges are defined at the level of the clusters, with a
 #' beta prior for the unknown inclusion probability with shape parameters
-#' \code{beta_bernoulli_alpha} and \code{beta_bernoulli_beta}. The default is
+#' \code{beta_bernoulli_alpha} and \code{beta_bernoulli_beta}, and a Dirichlet
+#' prior on the cluster assignment probabilities with a common concentration
+#' parameter \code{dirichlet_alpha}. The default is
 #' \code{edge_prior = "Bernoulli"}.
 #' @param inclusion_probability The prior edge inclusion probability for the
 #' Bernoulli model. Can be a single probability, or a matrix of \code{p} rows
@@ -127,7 +129,7 @@
 #' positive numbers. Defaults to \code{beta_bernoulli_alpha = 1} and
 #' \code{beta_bernoulli_beta = 1}.
 #' @param dirichlet_alpha The shape of the Dirichlet prior on the node-to-block
-#' allocation parameters for the Stochastic Block model.
+#' allocation probabilities for the Stochastic Block model.
 #' @param na.action How do you want the function to handle missing data? If
 #' \code{na.action = "listwise"}, listwise deletion is used. If
 #' \code{na.action = "impute"}, missing data are imputed iteratively during the
@@ -156,8 +158,14 @@
 #' ``blume-capel'' variables, the first entry is the parameter for the linear
 #' effect and the second entry is the parameter for the quadratic effect, which
 #' models the offset to the reference category.
+#' \item In the case of
+#' \code{edge_prior = "Stochastic-Block"} two additional elements are returned:
+#' a vector \code{allocations} with the estimated cluster assignments of the nodes and an
+#' table \code{clusters} with the estimated posterior probability of the number
+#' of clusters in the network. The vector of node allocations is calculated using
+#' a method proposed by \insertCite{Dahl2009}{bgms} and also used by
+#' \insertCite{GengEtAl_2019}{bgms}.
 #' }
-#'
 #' If \code{save = TRUE}, the result is a list of class ``bgms'' containing:
 #' \itemize{
 #' \item \code{indicator}: A matrix with \code{iter} rows and
@@ -169,8 +177,15 @@
 #' \item \code{thresholds}: A matrix with \code{iter} rows and
 #' \code{sum(m)} columns, containing parameter states from every iteration of
 #' the Gibbs sampler for the category thresholds.
+#' \item In the case of
+#' \code{edge_prior = "Stochastic-Block"} a matrix \code{allocations} with the
+#' cluster assignments of the nodes from each iteration is returned. This matrix
+#' can be used to calculate the posterior probability of the number of clusters
+#' by utilizing the \code{summary_SBM(bgm_output[["allocations"]])} function.
 #' }
 #' Column averages of these matrices provide the model-averaged posterior means.
+#' Except for the \code{allocations} matrix, for which the \code{summary_SBM}
+#' needs to be utilized.
 #'
 #' In addition to the analysis results, the bgm output lists some of the
 #' arguments of its call. This is useful for post-processing the results.
@@ -504,10 +519,24 @@ bgm = function(x,
     arguments$data_columnnames = data_columnnames
 
     if(edge_selection == TRUE) {
-      output = list(indicator = indicator,
-                    interactions = interactions,
-                    thresholds = thresholds,
-                    arguments = arguments)
+      if(edge_prior == "Stochastic-Block"){
+
+        summarySbm <- summary_SBM(cluster_allocations = out$allocations)
+
+        output = list(indicator = indicator,
+                      interactions = interactions,
+                      thresholds = thresholds,
+                      allocations = summarySbm$allocations,
+                      clusters = summarySbm$no_clusters,
+                      arguments = arguments)
+      } else {
+
+        output = list(indicator = indicator,
+                           interactions = interactions,
+                           thresholds = thresholds,
+                           arguments = arguments)
+        }
+
     } else {
       output = list(interactions = interactions,
                     thresholds = thresholds,
@@ -557,10 +586,19 @@ bgm = function(x,
     arguments$data_columnnames = data_columnnames
 
     if(edge_selection == TRUE) {
-      output = list(indicator = indicator,
-                    interactions = interactions,
-                    thresholds = thresholds,
-                    arguments = arguments)
+      if(edge_prior == "Stochastic-Block"){
+        output = list(indicator = indicator,
+                      interactions = interactions,
+                      thresholds = thresholds,
+                      allocations = out$allocations,
+                      arguments = arguments)
+      } else {
+
+        output = list(indicator = indicator,
+                      interactions = interactions,
+                      thresholds = thresholds,
+                      arguments = arguments)
+      }
     } else {
       output = list(interactions = interactions,
                     thresholds = thresholds,

diff --git a/R/utility_functions.R b/R/utility_functions.R
@@ -1007,3 +1007,59 @@ compare_reformat_data = function(x,
               missing_index_gr2 = missing_index_gr2,
               na_impute = na_impute))
 }
+
+# Dahl's method to summarize the samples of the cluster_allocations
+#  This function was adapted from the R code accompanying the paper:
+#  Geng, J., Bhattacharya, A., & Pati, D. (2019). Probabilistic Community
+#  Detection With Unknown Number of Communities, Journal of the American
+#  Statistical Association, 114:526, 893-905, DOI:10.1080/01621459.2018.1458618
+
+getDahl <- function(cluster_allocations) {
+  # Dimensions of the input matrix
+  niters <- nrow(cluster_allocations)  # Number of iterations
+  n <- ncol(cluster_allocations)       # Number of nodes
+
+  # Compute membership matrices for each iteration
+  membershipMatrices <- apply(cluster_allocations, 1, function(clusterAssign) {
+    outer(clusterAssign, clusterAssign, FUN = "==")
+  })
+
+  # Reshape membershipMatrices into a list of matrices
+  membershipMatrices <- lapply(seq_len(niters), function(i) {
+    matrix(membershipMatrices[, i], n, n)
+  })
+
+  # Compute the average membership matrix
+  membershipAverage <- Reduce("+", membershipMatrices) / niters
+
+  # Compute squared error for each iteration
+  SqError <- sapply(membershipMatrices, function(x, av) sum((x - av)^2),
+                    av = membershipAverage)
+
+  # Find the iteration with the minimum squared error
+  DahlIndex <- which.min(SqError)
+
+  # Extract the cluster assignment corresponding to the best iteration
+  DahlAns <- cluster_allocations[DahlIndex, , drop = TRUE]
+
+  return(DahlAns)
+}
+
+
+# A function that computes the cluster posterior probabilities and allocations
+
+summary_SBM <- function(cluster_allocations) {
+  # Compute the number of unique clusters for each iteration
+  clusters <- apply(cluster_allocations, 1, function(row) length(unique(row)))
+
+  # Compute the posterior probabilities of the actual unique clusters
+  no_clusters <- table(clusters) / length(clusters)
+
+  # Compute the allocations of the nodes based on Dahl's method
+  allocations <- getDahl(cluster_allocations)
+
+  # Return the results
+  return(list(no_clusters = no_clusters,
+              allocations = allocations))
+}
+
diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib
@@ -1,3 +1,16 @@
+@article{Dahl2009,
+  author       = {David B. Dahl},
+  title        = {Modal clustering in a class of product partition models},
+  journal      = {Bayesian Analysis},
+  volume       = {4},
+  number       = {2},
+  pages        = {243--264},
+  year         = {2009},
+  month        = {June},
+  doi          = {10.1214/09-BA409},
+  url          = {https://doi.org/10.1214/09-BA409}
+}
+
 @article{GengEtAl_2019,
   title={Probabilistic community detection with unknown number of communities},
   author={Geng, J. and Bhattacharya, A. and Pati, D.},
@@ -8,12 +21,14 @@ @article{GengEtAl_2019
   year={2019}}
 
 @article{SekulovskiEtAl_2023,
-  title={Testing conditional independence in psychometric networks: An analysis of three bayesian methods},
-  author={Sekulovski, N. and Keetelaar, S. and Huth, K. and Wagenmakers, {E.-J.} and van Bork, R. and van den Bergh, D. and Marsman, M.},
-  journal={Multivariate Behavioral Research},
-  doi = {10.1080/00273171.2024.2345915},
-  year={in press}
-}
+	author = {Sekulovski, N. and Keetelaar, S. and Huth, K. B. S. and Wagenmakers, Eric-Jan and {van }Bork, R. and {van den }Bergh, D. and Marsman, M.},
+	doi = {10.1080/00273171.2024.2345915},
+	journal = {Multivariate Behavioral Research},
+	pages = {913--933},
+	title = {Testing conditional independence in psychometric networks: {A}n analysis of three {B}ayesian methods},
+	volume = {59},
+	year = {2024},
+	bdsk-url-1 = {https://doi.org/10.1080/00273171.2024.2345915}}
 
 @article{Dienes_2014,
 	author = {Dienes, Z.},

diff --git a/man/bgm.Rd b/man/bgm.Rd
diff --git a/src/gibbs_functions.cpp b/src/gibbs_functions.cpp
@@ -812,6 +812,8 @@ List gibbs_sampler(IntegerMatrix observations,
   int no_interactions = Index.nrow();
   int no_thresholds = sum(no_categories);
   int max_no_categories = max(no_categories);
+  IntegerVector K_values;  // To store sampled K values
+
 
   IntegerVector v = seq(0, no_interactions - 1);
   IntegerVector order(no_interactions);
@@ -864,7 +866,10 @@ List gibbs_sampler(IntegerMatrix observations,
   NumericMatrix cluster_prob(1, 1);
   NumericVector log_Vn(1);
 
-  if(edge_prior == "Stochastic-Block") {
+  // store the allocation indices for each iteration
+  NumericMatrix out_allocations(iter, no_variables);
+
+  if(edge_prior == "Stochastic-Block") { // Initial Configuration of the cluster allocations
     cluster_allocations[0] = 0;
     cluster_allocations[1] = 1;
     for(int i = 2; i < no_variables; i++) {
@@ -1038,11 +1043,17 @@ List gibbs_sampler(IntegerMatrix observations,
   //The post burn-in iterations ------------------------------------------------
   for(int iteration = 0; iteration < iter; iteration++) {
     if (Progress::check_abort()) {
-      if(edge_selection == true) {
+      if(edge_selection == true && edge_prior == "Stochastic-Block") {
+        return List::create(Named("indicator") = out_indicator,
+                            Named("interactions") = out_interactions,
+                            Named("thresholds") = out_thresholds,
+                            Named("allocations") = out_allocations);
+      } if(edge_selection == true && edge_prior != "Stochastic-Block") {
         return List::create(Named("indicator") = out_indicator,
                             Named("interactions") = out_interactions,
                             Named("thresholds") = out_thresholds);
-      } else {
+      }
+      else {
         return List::create(Named("interactions") = out_interactions,
                             Named("thresholds") = out_thresholds);
       }
@@ -1150,6 +1161,7 @@ List gibbs_sampler(IntegerMatrix observations,
                                            beta_bernoulli_alpha,
                                            beta_bernoulli_beta);
 
+
         for(int i = 0; i < no_variables - 1; i++) {
           for(int j = i + 1; j < no_variables; j++) {
             theta(i, j) = cluster_prob(cluster_allocations[i], cluster_allocations[j]);
@@ -1234,15 +1246,26 @@ List gibbs_sampler(IntegerMatrix observations,
         out_thresholds(no_variables - 1, 1) /= iteration + 1;
       }
     }
+
+    for(int i = 0; i < no_variables; i++) {
+      out_allocations(iteration, i) = cluster_allocations[i] + 1;
+    }
   }
 
 
   if(edge_selection == true) {
+    if(edge_prior == "Stochastic-Block"){
     return List::create(Named("indicator") = out_indicator,
                         Named("interactions") = out_interactions,
-                        Named("thresholds") = out_thresholds);
+                        Named("thresholds") = out_thresholds,
+                        Named("allocations") = out_allocations);
+    } else {
+      return List::create(Named("indicator") = out_indicator,
+                          Named("interactions") = out_interactions,
+                          Named("thresholds") = out_thresholds);
+    }
   } else {
-    return List::create(Named("interactions") = out_interactions,
-                        Named("thresholds") = out_thresholds);
+      return List::create(Named("interactions") = out_interactions,
+                          Named("thresholds") = out_thresholds);
   }
 }
diff --git a/src/gibbs_functions_edge_prior.cpp b/src/gibbs_functions_edge_prior.cpp
@@ -337,6 +337,7 @@ IntegerVector block_allocations_mfm_sbm(IntegerVector cluster_assign,
     }
   }
   return cluster_assign;
+
 }
 
 // ----------------------------------------------------------------------------|
@@ -376,4 +377,4 @@ NumericMatrix block_probs_mfm_sbm(IntegerVector cluster_assign,
   }
 
   return block_probs;
-}
+}
diff --git a/vignettes/refs.bib b/vignettes/refs.bib
@@ -1,10 +1,13 @@
+
 @article{SekulovskiEtAl_2023,
-  title={Testing conditional independence in psychometric networks: An analysis of three bayesian methods},
-  author={Sekulovski, N. and Keetelaar, S. and Huth, K. and Wagenmakers, {E.-J.} and van Bork, R. and van den Bergh, D. and Marsman, M.},
-  journal={Multivariate Behavioral Research},
-  doi = {10.1080/00273171.2024.2345915},
-  year={in press}
-}
+	author = {Sekulovski, N. and Keetelaar, S. and Huth, K. B. S. and Wagenmakers, Eric-Jan and {van }Bork, R. and {van den }Bergh, D. and Marsman, M.},
+	doi = {10.1080/00273171.2024.2345915},
+	journal = {Multivariate Behavioral Research},
+	pages = {913--933},
+	title = {Testing conditional independence in psychometric networks: {A}n analysis of three {B}ayesian methods},
+	volume = {59},
+	year = {2024},
+	bdsk-url-1 = {https://doi.org/10.1080/00273171.2024.2345915}}
 
 @article{GengEtAl_2019,
   title={Probabilistic community detection with unknown number of communities},
-Original file line number
+Diff line change
@@ Expand Up @@
         }
       }
       return cluster_assign;
     }
     // ----------------------------------------------------------------------------|
@@ Expand Down Expand Up @@
       }
       return block_probs;
-    }
+    }