Fix ordering of two-level latent variable (lv) samples when the rows of the dataset are not arranged by cluster

ecmerkle · ecmerkle · commit 23db309dee3f · 2025-12-22T14:34:47.000-06:00
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: blavaan
 Title: Bayesian Latent Variable Analysis
-Version: 0.5-9.1376
+Version: 0.5-9.1377
 Authors@R: c(person(given = "Edgar", family = "Merkle",
                     role = c("aut", "cre"),
                     email = "merklee@missouri.edu",
diff --git a/NEWS.md b/NEWS.md
@@ -5,6 +5,8 @@
 ## Bugs/glitches:
 * For some ordinal models with no latent variables, information criteria are incorrect (inflated effective number of parameters).
 
+* For two-level datasets where rows are not ordered by cluster, latent variable predictions may not be ordered as expected.
+
 # Version 0.5-8
 ## New features
 * This release contains minor improvements and bug fixes.
diff --git a/R/lav_export_stanmarg.R b/R/lav_export_stanmarg.R
@@ -1256,12 +1256,13 @@ lav2standata <- function(lavobject, dosam = FALSE) {
           cidx[[g]] <- cidx[[g]] + max(cidx[[(g - 1)]])
         }
       }
-      cidx <- unlist(cidx)
+      cidx <- unlist(cidx) ## FIXME assumes group 2 observations come after group 1 observations
     }
     mean_d_full <- rowsum.default(as.matrix(dat$YX), cidx) / dat$cluster_size
 
     tmpYX <- split.data.frame(dat$YX, cidx)
     dat$YX <- do.call("rbind", tmpYX)
+    dat$orig_id <- unlist(split(1:nrow(dat$YX), cidx))
     dat$log_lik_x_full <- llx_2l(Lp[[1]], dat$YX, mean_d_full, cidx)
     dat$mean_d_full <- lapply(1:nrow(mean_d_full), function(i) mean_d_full[i, dat$between_idx])
 
diff --git a/R/lvgqs.R b/R/lvgqs.R
@@ -303,7 +303,7 @@ samp_lvs_2lev <- function(mcobj, lavmodel, lavsamplestats, lavdata, lavpartable,
         between.idx <- Lp$between.idx[[2]]
 
         if(length(between.idx) > 0L){
-          YX.B[, between.idx] <- stanorig$YX[!duplicated(Lp$cluster.idx[[2]]), between.idx]
+          YX.B[, between.idx] <- stanorig$YX[order(standata$orig_id),][!duplicated(Lp$cluster.idx[[2]]), between.idx] #stanorig$YX[!duplicated(Lp$cluster.idx[[2]]), between.idx]
         }
 
         ## manipulations to reuse existing lvgqs code
@@ -326,6 +326,7 @@ samp_lvs_2lev <- function(mcobj, lavmodel, lavsamplestats, lavdata, lavpartable,
 
         ## now level 1
         standata <- stanorig
+        ## the YX matrix has been ordered by cluster already:
         clusidx <- rep(1:length(standata$cluster_size), standata$cluster_size)
         standata$YX <- with(standata, YX[, between_idx[(N_between + 1):p_tilde]]) - clusmns[clusidx,]
         modmat1 <- modmats[2 * (1:standata$Ng) - 2 + 1]
@@ -346,10 +347,14 @@ samp_lvs_2lev <- function(mcobj, lavmodel, lavsamplestats, lavdata, lavpartable,
   }
 
   etasamps <- do.call(funcall, loop.args)
-
   etaout <- vector("list", 2)
+  idmap <- standata$orig_id ## to put the lvs back in their original order
   for (i in 1:2) {
-    tmpeta <- lapply(etasamps, function(x) x[[i]])
+    if (i == 1) {
+      tmpeta <- lapply(etasamps, function(x) x[[i]][, order(idmap), ])
+    } else if (i == 2) {
+      tmpeta <- lapply(etasamps, function(x) x[[i]])
+    }
     tmpN <- ifelse(i==1, standata$Ntot, sum(standata$nclus[,2]))
     tmpw9 <- ifelse(i==1, standata$w9use + standata$w9no, standata$w9use_c + standata$w9no_c)
 
diff --git a/R/stanmarg_data.R b/R/stanmarg_data.R
@@ -275,7 +275,7 @@ stanmarg_data <- function(YX = NULL, S = NULL, YXo = NULL, N, Ng, grpnum, # data
                           Ndum = NULL, dum_ov_idx = NULL, dum_lv_idx = NULL, # for bsam
                           Ndum_x = NULL, dum_ov_x_idx = NULL, dum_lv_x_idx = NULL,
                           measnblk = NULL, measblkse = NULL, measorder = NULL, measrevord = NULL,
-                          ngh = NULL, ghnode = NULL, ghwt = NULL,
+                          ngh = NULL, ghnode = NULL, ghwt = NULL, orig_id = NULL,
                           ...) {
   
   dat <- list()
@@ -324,6 +324,7 @@ stanmarg_data <- function(YX = NULL, S = NULL, YXo = NULL, N, Ng, grpnum, # data
   dat$ngh <- ngh
   dat$ghnode <- ghnode
   dat$ghwt <- ghwt
+  dat$orig_id <- orig_id
   
   dat$use_suff <- 1L
   if (ord | multilev) dat$use_suff <- 0L

Original file line number	Diff line number	Diff line change
`@@ -1256,12 +1256,13 @@ lav2standata <- function(lavobject, dosam = FALSE) {`
`1256`	`1256`	`cidx[[g]] <- cidx[[g]] + max(cidx[[(g - 1)]])`
`1257`	`1257`	`}`
`1258`	`1258`	`}`
`1259`		`- cidx <- unlist(cidx)`
	`1259`	`+ cidx <- unlist(cidx) ## FIXME assumes group 2 observations come after group 1 observations`
`1260`	`1260`	`}`
`1261`	`1261`	`mean_d_full <- rowsum.default(as.matrix(dat$YX), cidx) / dat$cluster_size`
`1262`	`1262`
`1263`	`1263`	`tmpYX <- split.data.frame(dat$YX, cidx)`
`1264`	`1264`	`dat$YX <- do.call("rbind", tmpYX)`
	`1265`	`+ dat$orig_id <- unlist(split(1:nrow(dat$YX), cidx))`
`1265`	`1266`	`dat$log_lik_x_full <- llx_2l(Lp[[1]], dat$YX, mean_d_full, cidx)`
`1266`	`1267`	`dat$mean_d_full <- lapply(1:nrow(mean_d_full), function(i) mean_d_full[i, dat$between_idx])`
`1267`	`1268`