diff --git a/R/bgm.R b/R/bgm.R index 88377f44..799eb625 100644 --- a/R/bgm.R +++ b/R/bgm.R @@ -111,12 +111,14 @@ #' model \code{edge_prior = "Stochastic-Block"} assumes that nodes can be #' organized into blocks or clusters. In principle, the assignment of nodes to #' such clusters is unknown, and the model as implemented here considers all -#' possible options \insertCite{@i.e., specifies a Dirichlet process on the node to block -#' allocation as described by @GengEtAl_2019}{bgms}. This model is advantageous +#' possible options \insertCite{@i.e., specifies a Dirichlet prior on the probability +#' of allocations as described by @GengEtAl_2019}{bgms}. This model is advantageous #' when nodes are expected to fall into distinct clusters. The inclusion #' probabilities for the edges are defined at the level of the clusters, with a #' beta prior for the unknown inclusion probability with shape parameters -#' \code{beta_bernoulli_alpha} and \code{beta_bernoulli_beta}. The default is +#' \code{beta_bernoulli_alpha} and \code{beta_bernoulli_beta}, and a Dirichlet +#' prior on the cluster assignment probabilities with a common concentration +#' parameter \code{dirichlet_alpha}. The default is #' \code{edge_prior = "Bernoulli"}. #' @param inclusion_probability The prior edge inclusion probability for the #' Bernoulli model. Can be a single probability, or a matrix of \code{p} rows @@ -127,7 +129,7 @@ #' positive numbers. Defaults to \code{beta_bernoulli_alpha = 1} and #' \code{beta_bernoulli_beta = 1}. #' @param dirichlet_alpha The shape of the Dirichlet prior on the node-to-block -#' allocation parameters for the Stochastic Block model. +#' allocation probabilities for the Stochastic Block model. #' @param na.action How do you want the function to handle missing data? If #' \code{na.action = "listwise"}, listwise deletion is used. If #' \code{na.action = "impute"}, missing data are imputed iteratively during the @@ -156,8 +158,14 @@ #' ``blume-capel'' variables, the first entry is the parameter for the linear #' effect and the second entry is the parameter for the quadratic effect, which #' models the offset to the reference category. +#' \item In the case of +#' \code{edge_prior = "Stochastic-Block"} two additional elements are returned: +#' a vector \code{allocations} with the estimated cluster assignments of the nodes and an +#' table \code{clusters} with the estimated posterior probability of the number +#' of clusters in the network. The vector of node allocations is calculated using +#' a method proposed by \insertCite{Dahl2009}{bgms} and also used by +#' \insertCite{GengEtAl_2019}{bgms}. #' } -#' #' If \code{save = TRUE}, the result is a list of class ``bgms'' containing: #' \itemize{ #' \item \code{indicator}: A matrix with \code{iter} rows and @@ -169,8 +177,15 @@ #' \item \code{thresholds}: A matrix with \code{iter} rows and #' \code{sum(m)} columns, containing parameter states from every iteration of #' the Gibbs sampler for the category thresholds. +#' \item In the case of +#' \code{edge_prior = "Stochastic-Block"} a matrix \code{allocations} with the +#' cluster assignments of the nodes from each iteration is returned. This matrix +#' can be used to calculate the posterior probability of the number of clusters +#' by utilizing the \code{summary_SBM(bgm_output[["allocations"]])} function. #' } #' Column averages of these matrices provide the model-averaged posterior means. +#' Except for the \code{allocations} matrix, for which the \code{summary_SBM} +#' needs to be utilized. #' #' In addition to the analysis results, the bgm output lists some of the #' arguments of its call. This is useful for post-processing the results. @@ -504,10 +519,24 @@ bgm = function(x, arguments$data_columnnames = data_columnnames if(edge_selection == TRUE) { - output = list(indicator = indicator, - interactions = interactions, - thresholds = thresholds, - arguments = arguments) + if(edge_prior == "Stochastic-Block"){ + + summarySbm <- summary_SBM(cluster_allocations = out$allocations) + + output = list(indicator = indicator, + interactions = interactions, + thresholds = thresholds, + allocations = summarySbm$allocations, + clusters = summarySbm$no_clusters, + arguments = arguments) + } else { + + output = list(indicator = indicator, + interactions = interactions, + thresholds = thresholds, + arguments = arguments) + } + } else { output = list(interactions = interactions, thresholds = thresholds, @@ -557,10 +586,19 @@ bgm = function(x, arguments$data_columnnames = data_columnnames if(edge_selection == TRUE) { - output = list(indicator = indicator, - interactions = interactions, - thresholds = thresholds, - arguments = arguments) + if(edge_prior == "Stochastic-Block"){ + output = list(indicator = indicator, + interactions = interactions, + thresholds = thresholds, + allocations = out$allocations, + arguments = arguments) + } else { + + output = list(indicator = indicator, + interactions = interactions, + thresholds = thresholds, + arguments = arguments) + } } else { output = list(interactions = interactions, thresholds = thresholds, diff --git a/R/utility_functions.R b/R/utility_functions.R index 3acbcb1a..4c12ba61 100644 --- a/R/utility_functions.R +++ b/R/utility_functions.R @@ -1007,3 +1007,59 @@ compare_reformat_data = function(x, missing_index_gr2 = missing_index_gr2, na_impute = na_impute)) } + +# Dahl's method to summarize the samples of the cluster_allocations +# This function was adapted from the R code accompanying the paper: +# Geng, J., Bhattacharya, A., & Pati, D. (2019). Probabilistic Community +# Detection With Unknown Number of Communities, Journal of the American +# Statistical Association, 114:526, 893-905, DOI:10.1080/01621459.2018.1458618 + +getDahl <- function(cluster_allocations) { + # Dimensions of the input matrix + niters <- nrow(cluster_allocations) # Number of iterations + n <- ncol(cluster_allocations) # Number of nodes + + # Compute membership matrices for each iteration + membershipMatrices <- apply(cluster_allocations, 1, function(clusterAssign) { + outer(clusterAssign, clusterAssign, FUN = "==") + }) + + # Reshape membershipMatrices into a list of matrices + membershipMatrices <- lapply(seq_len(niters), function(i) { + matrix(membershipMatrices[, i], n, n) + }) + + # Compute the average membership matrix + membershipAverage <- Reduce("+", membershipMatrices) / niters + + # Compute squared error for each iteration + SqError <- sapply(membershipMatrices, function(x, av) sum((x - av)^2), + av = membershipAverage) + + # Find the iteration with the minimum squared error + DahlIndex <- which.min(SqError) + + # Extract the cluster assignment corresponding to the best iteration + DahlAns <- cluster_allocations[DahlIndex, , drop = TRUE] + + return(DahlAns) +} + + +# A function that computes the cluster posterior probabilities and allocations + +summary_SBM <- function(cluster_allocations) { + # Compute the number of unique clusters for each iteration + clusters <- apply(cluster_allocations, 1, function(row) length(unique(row))) + + # Compute the posterior probabilities of the actual unique clusters + no_clusters <- table(clusters) / length(clusters) + + # Compute the allocations of the nodes based on Dahl's method + allocations <- getDahl(cluster_allocations) + + # Return the results + return(list(no_clusters = no_clusters, + allocations = allocations)) +} + diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index 1d63bbfe..18afffc3 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -1,3 +1,16 @@ +@article{Dahl2009, + author = {David B. Dahl}, + title = {Modal clustering in a class of product partition models}, + journal = {Bayesian Analysis}, + volume = {4}, + number = {2}, + pages = {243--264}, + year = {2009}, + month = {June}, + doi = {10.1214/09-BA409}, + url = {https://doi.org/10.1214/09-BA409} +} + @article{GengEtAl_2019, title={Probabilistic community detection with unknown number of communities}, author={Geng, J. and Bhattacharya, A. and Pati, D.}, @@ -8,12 +21,14 @@ @article{GengEtAl_2019 year={2019}} @article{SekulovskiEtAl_2023, - title={Testing conditional independence in psychometric networks: An analysis of three bayesian methods}, - author={Sekulovski, N. and Keetelaar, S. and Huth, K. and Wagenmakers, {E.-J.} and van Bork, R. and van den Bergh, D. and Marsman, M.}, - journal={Multivariate Behavioral Research}, - doi = {10.1080/00273171.2024.2345915}, - year={in press} -} + author = {Sekulovski, N. and Keetelaar, S. and Huth, K. B. S. and Wagenmakers, Eric-Jan and {van }Bork, R. and {van den }Bergh, D. and Marsman, M.}, + doi = {10.1080/00273171.2024.2345915}, + journal = {Multivariate Behavioral Research}, + pages = {913--933}, + title = {Testing conditional independence in psychometric networks: {A}n analysis of three {B}ayesian methods}, + volume = {59}, + year = {2024}, + bdsk-url-1 = {https://doi.org/10.1080/00273171.2024.2345915}} @article{Dienes_2014, author = {Dienes, Z.}, diff --git a/man/bgm.Rd b/man/bgm.Rd index 6c8fe3d5..d091164b 100644 --- a/man/bgm.Rd +++ b/man/bgm.Rd @@ -101,12 +101,14 @@ complexity (number of edges) get the same prior weight. The Stochastic Block model \code{edge_prior = "Stochastic-Block"} assumes that nodes can be organized into blocks or clusters. In principle, the assignment of nodes to such clusters is unknown, and the model as implemented here considers all -possible options \insertCite{@i.e., specifies a Dirichlet process on the node to block -allocation as described by @GengEtAl_2019}{bgms}. This model is advantageous +possible options \insertCite{@i.e., specifies a Dirichlet prior on the probability +of allocations as described by @GengEtAl_2019}{bgms}. This model is advantageous when nodes are expected to fall into distinct clusters. The inclusion probabilities for the edges are defined at the level of the clusters, with a beta prior for the unknown inclusion probability with shape parameters -\code{beta_bernoulli_alpha} and \code{beta_bernoulli_beta}. The default is +\code{beta_bernoulli_alpha} and \code{beta_bernoulli_beta}, and a Dirichlet +prior on the cluster assignment probabilities with a common concentration +parameter \code{dirichlet_alpha}. The default is \code{edge_prior = "Bernoulli"}.} \item{inclusion_probability}{The prior edge inclusion probability for the @@ -120,7 +122,7 @@ positive numbers. Defaults to \code{beta_bernoulli_alpha = 1} and \code{beta_bernoulli_beta = 1}.} \item{dirichlet_alpha}{The shape of the Dirichlet prior on the node-to-block -allocation parameters for the Stochastic Block model.} +allocation probabilities for the Stochastic Block model.} \item{na.action}{How do you want the function to handle missing data? If \code{na.action = "listwise"}, listwise deletion is used. If @@ -153,8 +155,14 @@ columns, containing model-averaged category thresholds. In the case of ``blume-capel'' variables, the first entry is the parameter for the linear effect and the second entry is the parameter for the quadratic effect, which models the offset to the reference category. +\item In the case of +\code{edge_prior = "Stochastic-Block"} two additional elements are returned: +a vector \code{allocations} with the estimated cluster assignments of the nodes and an +table \code{clusters} with the estimated posterior probability of the number +of clusters in the network. The vector of node allocations is calculated using +a method proposed by \insertCite{Dahl2009}{bgms} and also used by +\insertCite{GengEtAl_2019}{bgms}. } - If \code{save = TRUE}, the result is a list of class ``bgms'' containing: \itemize{ \item \code{indicator}: A matrix with \code{iter} rows and @@ -166,8 +174,15 @@ iteration of the Gibbs sampler for the pairwise associations. \item \code{thresholds}: A matrix with \code{iter} rows and \code{sum(m)} columns, containing parameter states from every iteration of the Gibbs sampler for the category thresholds. +\item In the case of +\code{edge_prior = "Stochastic-Block"} a matrix \code{allocations} with the +cluster assignments of the nodes from each iteration is returned. This matrix +can be used to calculate the posterior probability of the number of clusters +by utilizing the \code{summary_SBM(bgm_output[["allocations"]])} function. } Column averages of these matrices provide the model-averaged posterior means. +Except for the \code{allocations} matrix, for which the \code{summary_SBM} +needs to be utilized. In addition to the analysis results, the bgm output lists some of the arguments of its call. This is useful for post-processing the results. diff --git a/src/gibbs_functions.cpp b/src/gibbs_functions.cpp index 50ad6529..56e06236 100644 --- a/src/gibbs_functions.cpp +++ b/src/gibbs_functions.cpp @@ -812,6 +812,8 @@ List gibbs_sampler(IntegerMatrix observations, int no_interactions = Index.nrow(); int no_thresholds = sum(no_categories); int max_no_categories = max(no_categories); + IntegerVector K_values; // To store sampled K values + IntegerVector v = seq(0, no_interactions - 1); IntegerVector order(no_interactions); @@ -864,7 +866,10 @@ List gibbs_sampler(IntegerMatrix observations, NumericMatrix cluster_prob(1, 1); NumericVector log_Vn(1); - if(edge_prior == "Stochastic-Block") { + // store the allocation indices for each iteration + NumericMatrix out_allocations(iter, no_variables); + + if(edge_prior == "Stochastic-Block") { // Initial Configuration of the cluster allocations cluster_allocations[0] = 0; cluster_allocations[1] = 1; for(int i = 2; i < no_variables; i++) { @@ -1038,11 +1043,17 @@ List gibbs_sampler(IntegerMatrix observations, //The post burn-in iterations ------------------------------------------------ for(int iteration = 0; iteration < iter; iteration++) { if (Progress::check_abort()) { - if(edge_selection == true) { + if(edge_selection == true && edge_prior == "Stochastic-Block") { + return List::create(Named("indicator") = out_indicator, + Named("interactions") = out_interactions, + Named("thresholds") = out_thresholds, + Named("allocations") = out_allocations); + } if(edge_selection == true && edge_prior != "Stochastic-Block") { return List::create(Named("indicator") = out_indicator, Named("interactions") = out_interactions, Named("thresholds") = out_thresholds); - } else { + } + else { return List::create(Named("interactions") = out_interactions, Named("thresholds") = out_thresholds); } @@ -1150,6 +1161,7 @@ List gibbs_sampler(IntegerMatrix observations, beta_bernoulli_alpha, beta_bernoulli_beta); + for(int i = 0; i < no_variables - 1; i++) { for(int j = i + 1; j < no_variables; j++) { theta(i, j) = cluster_prob(cluster_allocations[i], cluster_allocations[j]); @@ -1234,15 +1246,26 @@ List gibbs_sampler(IntegerMatrix observations, out_thresholds(no_variables - 1, 1) /= iteration + 1; } } + + for(int i = 0; i < no_variables; i++) { + out_allocations(iteration, i) = cluster_allocations[i] + 1; + } } if(edge_selection == true) { + if(edge_prior == "Stochastic-Block"){ return List::create(Named("indicator") = out_indicator, Named("interactions") = out_interactions, - Named("thresholds") = out_thresholds); + Named("thresholds") = out_thresholds, + Named("allocations") = out_allocations); + } else { + return List::create(Named("indicator") = out_indicator, + Named("interactions") = out_interactions, + Named("thresholds") = out_thresholds); + } } else { - return List::create(Named("interactions") = out_interactions, - Named("thresholds") = out_thresholds); + return List::create(Named("interactions") = out_interactions, + Named("thresholds") = out_thresholds); } } \ No newline at end of file diff --git a/src/gibbs_functions_edge_prior.cpp b/src/gibbs_functions_edge_prior.cpp index 50a65d8b..0cdbc535 100644 --- a/src/gibbs_functions_edge_prior.cpp +++ b/src/gibbs_functions_edge_prior.cpp @@ -337,6 +337,7 @@ IntegerVector block_allocations_mfm_sbm(IntegerVector cluster_assign, } } return cluster_assign; + } // ----------------------------------------------------------------------------| @@ -376,4 +377,4 @@ NumericMatrix block_probs_mfm_sbm(IntegerVector cluster_assign, } return block_probs; -} \ No newline at end of file +} diff --git a/vignettes/refs.bib b/vignettes/refs.bib index 44c43c84..1e76345a 100644 --- a/vignettes/refs.bib +++ b/vignettes/refs.bib @@ -1,10 +1,13 @@ + @article{SekulovskiEtAl_2023, - title={Testing conditional independence in psychometric networks: An analysis of three bayesian methods}, - author={Sekulovski, N. and Keetelaar, S. and Huth, K. and Wagenmakers, {E.-J.} and van Bork, R. and van den Bergh, D. and Marsman, M.}, - journal={Multivariate Behavioral Research}, - doi = {10.1080/00273171.2024.2345915}, - year={in press} -} + author = {Sekulovski, N. and Keetelaar, S. and Huth, K. B. S. and Wagenmakers, Eric-Jan and {van }Bork, R. and {van den }Bergh, D. and Marsman, M.}, + doi = {10.1080/00273171.2024.2345915}, + journal = {Multivariate Behavioral Research}, + pages = {913--933}, + title = {Testing conditional independence in psychometric networks: {A}n analysis of three {B}ayesian methods}, + volume = {59}, + year = {2024}, + bdsk-url-1 = {https://doi.org/10.1080/00273171.2024.2345915}} @article{GengEtAl_2019, title={Probabilistic community detection with unknown number of communities},