Skip to content

Commit 81c9a0a

Browse files
committed
better surname transparency; fixes #25 #26
1 parent ae465b9 commit 81c9a0a

File tree

4 files changed

+66
-15
lines changed

4 files changed

+66
-15
lines changed

DESCRIPTION

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: birdie
22
Title: Bayesian Instrumental Regression for Disparity Estimation
3-
Version: 0.7.1
3+
Version: 0.7.1.9000
44
Authors@R: c(
55
person("Cory", "McCartan", role=c("aut", "cre"), email="mccartan@psu.edu"),
66
person("Kosuke", "Imai", role="ctb", email="imai@harvard.edu"),
@@ -9,13 +9,13 @@ Authors@R: c(
99
person("Robin", "Fisher", role="ctb", email="robin.fisher@treasury.gov"),
1010
person("The Stan", "Development Team", role = "cph", comment = "include/rstan")
1111
)
12-
Description: Bayesian models for accurately estimating conditional
13-
distributions by race, using Bayesian Improved Surname Geocoding (BISG)
12+
Description: Bayesian models for accurately estimating conditional
13+
distributions by race, using Bayesian Improved Surname Geocoding (BISG)
1414
probability estimates of individual race. Implements the methods described
1515
in McCartan, Fisher, Goldin, Ho and Imai (2025) <doi:10.1080/01621459.2025.2526695>.
16-
Depends:
16+
Depends:
1717
R (>= 3.5.0)
18-
Imports:
18+
Imports:
1919
rlang (>= 0.1.2),
2020
Rcpp (>= 0.12.0),
2121
cli,
@@ -27,7 +27,7 @@ Imports:
2727
stringr,
2828
RcppParallel (>= 5.0.1),
2929
SQUAREM
30-
Suggests:
30+
Suggests:
3131
daarem,
3232
easycensus,
3333
wru,
@@ -36,7 +36,7 @@ Suggests:
3636
rmarkdown,
3737
rstan,
3838
testthat (>= 3.0.0)
39-
LinkingTo:
39+
LinkingTo:
4040
Rcpp (>= 0.12.0),
4141
cli,
4242
BH (>= 1.66.0),

R/bisg.R

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,18 @@
77
#' and uses a Gibbs sampler to impute the individual race probabilities, using
88
#' the model of Imai et al. (2022).
99
#'
10+
#' # Surname Matching
11+
#' The Census surname table can be inspected with the following code:
12+
#' ```r
13+
#' readRDS(system.file("extdata", "names_2010_counts.rds", package="birdie"))
14+
#' ```
15+
#' Surnames are processed with [proc_name()] before being matched to the table.
16+
#' The code also recognizes double-barrelled (hyphenated) surnames and attempts
17+
#' to match each part if the overall name is not found in the surname table.
18+
#' Specifying `save_sr=TRUE` will save the matched surname table and a lookup
19+
#' vector that matches each individual to their surname table row. The overall
20+
#' match rate is reported as part of the `summary()` output.
21+
#'
1022
#' @param formula A formula specifying the BISG model. Must include the special
1123
#' term `nm()` to identify the surname variable. Certain geographic variables
1224
#' can be identified similarly: `zip()` for ZIP codes, and `state()` for
@@ -51,6 +63,8 @@
5163
#' @param save_rgx If `TRUE`, save the `p_rgx` table (matched to each
5264
#' individual) as the `"p_rgx"` and `"gx"` attributes of the output.
5365
#' Necessary for some sensitivity analyses.
66+
#' @param save_sr If `TRUE`, save the `p_sr` table (surname given race; matched
67+
#' to each individual) as the `"p_sr"` and `"s"` attributes of the output.
5468
#'
5569
#' @return An object of class `bisg`, which is just a data frame with some
5670
#' additional attributes. The data frame has rows matching the input data and
@@ -82,7 +96,7 @@
8296
#' @concept bisg
8397
#' @export
8498
bisg <- function(formula, data=NULL, p_r=p_r_natl(), p_rgx=NULL, p_rs=NULL,
85-
save_rgx=TRUE) {
99+
save_rgx=TRUE, save_sr=FALSE) {
86100
vars = parse_bisg_form(formula, data)
87101

88102
l_name = make_name_tbl_vec(vars, p_r, p_rs, FALSE)
@@ -100,6 +114,11 @@ bisg <- function(formula, data=NULL, p_r=p_r_natl(), p_rgx=NULL, p_rs=NULL,
100114
attr(out, "p_rgx") = l_gx$p_rgx
101115
attr(out, "gx") = l_gx$GX
102116
}
117+
if (isTRUE(save_sr)) {
118+
attr(out, "p_rs") = l_name$p_sr
119+
attr(out, "s") = l_name$S
120+
}
121+
attr(out, "unmatched") = c(s=l_name$unmatched, gx=l_gx$unmatched)
103122
attr(out, "method") = "std"
104123

105124
out
@@ -148,6 +167,7 @@ bisg_me <- function(formula, data=NULL, p_r=p_r_natl(), p_rgx=NULL, p_rs=NULL,
148167
attr(out, "S_name") = vars$S_name
149168
attr(out, "GX_names") = colnames(vars$GX)
150169
attr(out, "p_r") = l_gx$p_r
170+
attr(out, "unmatched") = c(s=l_name$unmatched, gx=l_gx$unmatched)
151171
attr(out, "method") = "me"
152172

153173
out
@@ -299,8 +319,11 @@ make_name_tbl_vec <- function(vars, p_r, p_rs, for_me=FALSE) {
299319
p_sr = p_sr[, idx_names]
300320
}
301321

302-
list(S = S,
303-
p_sr = p_sr) # p_sr is actualy p_rs, unnormalized, if `for_me=TRUE`
322+
list(
323+
S = S,
324+
p_sr = p_sr, # p_sr is actually p_rs, unnormalized, if `for_me=TRUE`
325+
unmatched = sum(S == "<generic>")
326+
)
304327
}
305328

306329
# Prepare geo/covariate vector and P(G, X | R) table
@@ -433,10 +456,13 @@ make_gx_tbl_vec <- function(vars, p_r, p_rgx) {
433456
p_r = p_r / sum(p_r)
434457
}
435458

436-
list(GX = GX_vec,
437-
p_r = p_r,
438-
p_rgx = p_rgx,
439-
p_gxr = p_gxr)
459+
list(
460+
p_r = p_r,
461+
GX = GX_vec,
462+
p_rgx = p_rgx,
463+
p_gxr = p_gxr,
464+
unmatched = sum(vars$GX[[1]] == "<none>")
465+
)
440466
}
441467

442468
# Call Bayes' rule C++

R/generics.R

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,13 @@ summary.bisg <- function(object, p_r=NULL, ...) {
366366
}
367367
cat("\n")
368368

369+
unm = attr(object, "unmatched")
370+
cli::cli_text("Unmatched surnames: {unm['s']} ({format(unm['s']/nrow(object)*100, digits=3)}%)")
371+
if ("gx" %in% names(unm)) {
372+
cli::cli_text("Unmatched geographies/covariates: {unm['gx']} ({format(unm['gx']/nrow(object)*100, digits=3)}%)")
373+
}
374+
375+
cat("\n")
369376
if (is.null(p_r)) {
370377
cli::cat_line("Implied marginal race distribution:")
371378
p_r = colMeans(object)

man/bisg.Rd

Lines changed: 19 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)