|
1 | 1 | ## Copyright (C) 2022-2025 Andreas Bertsatos <abertsatos@biol.uoa.gr> |
| 2 | +## Copyright (C) 2025 jayantchauhan <0001jayant@gmail.com> |
2 | 3 | ## |
3 | 4 | ## This file is part of the statistics package for GNU Octave. |
4 | 5 | ## |
|
33 | 34 | ## numeric, string array, or cell array of strings. @var{group} can be [] or |
34 | 35 | ## omitted to compute the mean of the entire sample without grouping. |
35 | 36 | ## |
| 37 | +## When the first input @var{x} is a table, @code{grpstats} computes groupwise |
| 38 | +## summary statistics for the numeric variables in the table and returns the |
| 39 | +## results in a new table. In this case, the grouping variable @var{group} |
| 40 | +## must be given as the name of a table variable, which is typically |
| 41 | +## categorical. Currently table input supports a subset of the statistics |
| 42 | +## available for matrix input, such as @qcode{"mean"} and @qcode{"numel"}. |
| 43 | +## |
36 | 44 | ## @code{[@var{a}, @var{b}, @dots{}] = grpstats (@var{x}, @var{group}, |
37 | 45 | ## @var{whichstats})}, for a numeric matrix X, returns the statistics specified |
38 | 46 | ## by @var{whichstats}, as separate arrays @var{a}, @var{b}, @dots{}. |
|
78 | 86 | function [varargout] = grpstats (x, group, whichstats, varargin) |
79 | 87 | ## Check input arguments |
80 | 88 | narginchk (1, 5) |
| 89 | + ## Table input (datatypes integration) |
| 90 | + if (exist ("istable", "file") && istable (x)) |
| 91 | + if (nargout > 1) |
| 92 | + error ("grpstats: table input currently supports a single output argument."); |
| 93 | + endif |
| 94 | + varargout{1} = __grpstats_table__ (x, group, whichstats, varargin{:}); |
| 95 | + return; |
| 96 | + endif |
| 97 | + |
| 98 | + ## Numeric matrix input (existing behaviour) |
81 | 99 | ## Check X being a vector or 2d matrix of real values |
82 | 100 | if (ndims (x) > 2 || ! isnumeric (x) || islogical (x)) |
83 | 101 | error ("grpstats: X must be a vector or 2d matrix."); |
|
210 | 228 | endfor |
211 | 229 | varargout{l} = group_numel; |
212 | 230 | case "meanci" |
| 231 | + ## allocate as 3-D: [ngroups x nvars x 2] (lower, upper) |
| 232 | + group_meanci = NaN(ngroups, c, 2); |
213 | 233 | for j = 1:ngroups |
214 | 234 | group_x = x(find (group_idx == j), :); |
215 | | - m = mean (group_x, 1, "omitnan") ; |
216 | | - n = size (x, 1) - sum (isnan (group_x), 1); |
217 | | - s = std (group_x, 0, 1, "omitnan") ./ sqrt (n); |
218 | | - d = s .* - tinv (alpha / 2, max (n - 1, [], 1)); |
219 | | - group_meanci(j,:) = [m-d, m+d]; |
| 235 | + for col = 1:c |
| 236 | + col_data = group_x(:, col); |
| 237 | + m = mean (col_data, "omitnan"); |
| 238 | + n = sum (! isnan (col_data)); % explicit per-column count |
| 239 | + if (n <= 1) |
| 240 | + % degenerate: CI degenerates to mean |
| 241 | + group_meanci(j, col, 1) = m; |
| 242 | + group_meanci(j, col, 2) = m; |
| 243 | + continue; |
| 244 | + endif |
| 245 | + s = std (col_data, 0, "omitnan") / sqrt (n); |
| 246 | + tval = -tinv (alpha / 2, n - 1); % scalar |
| 247 | + d = s * tval; |
| 248 | + group_meanci(j, col, 1) = m - d; |
| 249 | + group_meanci(j, col, 2) = m + d; |
| 250 | + endfor |
220 | 251 | endfor |
221 | 252 | varargout{l} = group_meanci; |
222 | 253 | case "predci" |
| 254 | + ## allocate as 3-D: [ngroups x nvars x 2] (lower, upper) |
| 255 | + group_predci = NaN(ngroups, c, 2); |
223 | 256 | for j = 1:ngroups |
224 | 257 | group_x = x(find (group_idx == j), :); |
225 | | - m = mean (group_x, 1, "omitnan") ; |
226 | | - n = size (x, 1) - sum (isnan (group_x), 1); |
227 | | - s = std (group_x, 0, 1, "omitnan") ./ sqrt (1 + (1 ./ n)); |
228 | | - d = s .* - tinv (alpha / 2, max (n - 1, [], 1)); |
229 | | - group_predci(j,:) = [m-d, m+d]; |
| 258 | + for col = 1:c |
| 259 | + col_data = group_x(:, col); |
| 260 | + m = mean (col_data, "omitnan"); |
| 261 | + n = sum (! isnan (col_data)); |
| 262 | + if (n <= 1) |
| 263 | + group_predci(j, col, 1) = m; |
| 264 | + group_predci(j, col, 2) = m; |
| 265 | + continue; |
| 266 | + endif |
| 267 | + s = std (col_data, 0, "omitnan") * sqrt (1 + 1 / n); |
| 268 | + tval = -tinv (alpha / 2, n - 1); |
| 269 | + d = s * tval; |
| 270 | + group_predci(j, col, 1) = m - d; |
| 271 | + group_predci(j, col, 2) = m + d; |
| 272 | + endfor |
230 | 273 | endfor |
231 | 274 | varargout{l} = group_predci; |
232 | 275 | case "gname" |
|
236 | 279 | endswitch |
237 | 280 | endfor |
238 | 281 | endif |
| 282 | + |
| 283 | +endfunction |
| 284 | + |
| 285 | +function stats_tbl = __grpstats_table__ (tbl, group, whichstats, varargin) |
| 286 | + |
| 287 | + if (! istable (tbl)) |
| 288 | + error ("grpstats: internal error, expected table input."); |
| 289 | + endif |
| 290 | + |
| 291 | + ## GROUP must be a variable name (char or string scalar) |
| 292 | + if (ischar (group)) |
| 293 | + group_name = group; |
| 294 | + elseif (isstring (group) && isscalar (group)) |
| 295 | + group_name = char (group); |
| 296 | + else |
| 297 | + error ("grpstats: for table input, GROUP must be a variable name."); |
| 298 | + endif |
| 299 | + |
| 300 | + ## Get grouping column |
| 301 | + try |
| 302 | + gcol = tbl.(group_name); |
| 303 | + catch |
| 304 | + error ("grpstats: grouping variable '%s' not found in table.", group_name); |
| 305 | + end_try_catch |
| 306 | + |
| 307 | + ## Currently only support categorical grouping variable |
| 308 | + if (! iscategorical (gcol)) |
| 309 | + error ("grpstats: for table input, grouping variable must be categorical."); |
| 310 | + endif |
| 311 | + |
| 312 | + ## Normalise whichstats for table input |
| 313 | + if (nargin < 3 || isempty (whichstats)) |
| 314 | + func_names = {"mean"}; |
| 315 | + elseif (ischar (whichstats)) |
| 316 | + func_names = {whichstats}; |
| 317 | + elseif (isstring (whichstats) && isscalar (whichstats)) |
| 318 | + func_names = {char (whichstats)}; |
| 319 | + elseif (iscell (whichstats)) |
| 320 | + func_names = whichstats; |
| 321 | + else |
| 322 | + error ("grpstats: invalid WHICHSTATS for table input."); |
| 323 | + endif |
| 324 | + |
| 325 | + ## Only support a subset initially for table input |
| 326 | + n_funcs = numel (func_names); |
| 327 | + for k = 1:n_funcs |
| 328 | + fname = func_names{k}; |
| 329 | + if (! any (strcmp (fname, {"mean", "numel"}))) |
| 330 | + error ("grpstats: table input currently supports only 'mean' and 'numel'."); |
| 331 | + endif |
| 332 | + endfor |
| 333 | + |
| 334 | + ## Group indices and names from categorical column |
| 335 | + group_names = categories (gcol); |
| 336 | + group_idx = double (gcol(:)); |
| 337 | + ngroups = numel (group_names); |
| 338 | + |
| 339 | + ## Collect numeric data columns (excluding the grouping variable) |
| 340 | + vnames = tbl.Properties.VariableNames; |
| 341 | + data_var_names = {}; |
| 342 | + data_mat = []; |
| 343 | + |
| 344 | + for k = 1:numel (vnames) |
| 345 | + vname = vnames{k}; |
| 346 | + if (strcmp (vname, group_name)) |
| 347 | + continue; |
| 348 | + endif |
| 349 | + col = tbl.(vname); |
| 350 | + if (isnumeric (col)) |
| 351 | + data_mat = [data_mat, col(:)]; |
| 352 | + data_var_names{end+1} = vname; |
| 353 | + endif |
| 354 | + endfor |
| 355 | + |
| 356 | + if (isempty (data_mat)) |
| 357 | + error ("grpstats: no numeric variables found in table (apart from grouping)."); |
| 358 | + endif |
| 359 | + |
| 360 | + nvars = columns (data_mat); |
| 361 | + |
| 362 | + do_mean = any (strcmp ("mean", func_names)); |
| 363 | + do_numel = any (strcmp ("numel", func_names)); |
| 364 | + |
| 365 | + if (do_mean) |
| 366 | + mean_vals = NaN (ngroups, nvars); |
| 367 | + endif |
| 368 | + if (do_numel) |
| 369 | + group_count = accumarray (group_idx(:), 1, [ngroups, 1]); |
| 370 | + endif |
| 371 | + |
| 372 | + ## Compute statistics per group |
| 373 | + for g = 1:ngroups |
| 374 | + idx = (group_idx == g); |
| 375 | + group_data = data_mat(idx, :); |
| 376 | + if (do_mean) |
| 377 | + mean_vals(g,:) = mean (group_data, 1, "omitnan"); |
| 378 | + endif |
| 379 | + endfor |
| 380 | + |
| 381 | + ## Build output table |
| 382 | + ## Group column as categorical using group names |
| 383 | + gcat = categorical (group_names); |
| 384 | + |
| 385 | + varnames_out = {"Group"}; |
| 386 | + data_out = {gcat}; |
| 387 | + |
| 388 | + if (do_numel) |
| 389 | + varnames_out{end+1} = "GroupCount"; |
| 390 | + data_out{end+1} = group_count; |
| 391 | + endif |
| 392 | + |
| 393 | + if (do_mean) |
| 394 | + for k = 1:nvars |
| 395 | + newname = ["mean_" data_var_names{k}]; |
| 396 | + varnames_out{end+1} = newname; |
| 397 | + data_out{end+1} = mean_vals(:, k); |
| 398 | + endfor |
| 399 | + endif |
| 400 | + |
| 401 | + stats_tbl = table (data_out{:}, "VariableNames", varnames_out); |
| 402 | + |
239 | 403 | endfunction |
240 | 404 |
|
241 | 405 | %!demo |
|
272 | 436 | %! load carsmall |
273 | 437 | %! [m,p,g] = grpstats ([Acceleration,Weight/1000], Cylinders, ... |
274 | 438 | %! {"mean", "meanci", "gname"}, 0.05); |
275 | | -%! assert (p(:,1), [11.17621760075134, 16.13845847655224, 16.16222663683362]', ... |
276 | | -%! [1e-14, 2e-14, 1e-14]'); |
| 439 | +%! % check meanci lower bounds (first slice) with tolerance |
| 440 | +%! expected_lower = [15.9163; 15.6622; 10.7968]; |
| 441 | +%! expected_upper = [17.4249; 17.2907; 12.4845]; |
| 442 | +%! assert (abs(p(:,1,1) - expected_lower) < 1e-3); % tolerance 1e-3 or tighter if desired |
| 443 | +%! assert (abs(p(:,1,2) - expected_upper) < 1e-3); |
277 | 444 | %!test |
278 | 445 | %! [mC, g] = grpstats ([], []); |
279 | 446 | %! assert (isempty (mC), true); |
280 | 447 | %! assert (isempty (g), true); |
281 | 448 |
|
| 449 | +## Table input tests (datatypes integration) |
| 450 | +%!test |
| 451 | +%! pkg load datatypes; |
| 452 | +%! Y = [5; 6; 7; 4; 9; 8]; |
| 453 | +%! X = [1; 2; 3; 4; 5; 6]; |
| 454 | +%! Group = categorical ({"A"; "A"; "B"; "B"; "C"; "C"}); |
| 455 | +%! tbl = table (Y, X, Group, "VariableNames", {"Y","X","Group"}); |
| 456 | +%! stats_tbl = grpstats (tbl, "Group", {"mean","numel"}); |
| 457 | +%! assert (istable (stats_tbl)); |
| 458 | +%! assert (isequal (stats_tbl.Properties.VariableNames, ... |
| 459 | +%! {"Group", "GroupCount", "mean_Y", "mean_X"})); |
| 460 | +%! assert (isequal (stats_tbl.GroupCount, [2; 2; 2])); |
| 461 | +%!test |
| 462 | +%! pkg load datatypes; |
| 463 | +%! Y = [5; 6; 7; 4; 9; 8]; |
| 464 | +%! Group = categorical ({"A"; "A"; "B"; "B"; "C"; "C"}); |
| 465 | +%! tbl = table (Y, Group, "VariableNames", {"Y","Group"}); |
| 466 | +%! stats_tbl = grpstats (tbl, "Group", "mean"); |
| 467 | +%! assert (istable (stats_tbl)); |
| 468 | +%! assert (isequal (stats_tbl.Properties.VariableNames, ... |
| 469 | +%! {"Group", "mean_Y"})); |
| 470 | + |
| 471 | +%!error<grpstats: for table input, grouping variable must be categorical.> ... |
| 472 | +%! pkg load datatypes; |
| 473 | +%! Y = [1; 2; 3]; |
| 474 | +%! G = [1; 1; 2]; |
| 475 | +%! tbl = table (Y, G, "VariableNames", {"Y","G"}); |
| 476 | +%! grpstats (tbl, "G", {"mean","numel"}); |
282 | 477 | %!error<grpstats: X must be a vector or 2d matrix.> ... |
283 | 478 | %! grpstats (ones (3, 3, 3)); |
284 | 479 | %!error<grpstats: samples in X and GROUPS mismatch.> ... |
|
0 commit comments