search-match-age-estimation/smooth-data.r at master · tobanw/search-match-age-estimation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
# Non-parametric (local-cubic) regression to smooth population stocks

# Command line usage: `Rscript smooth-data.r DB DEST [MODEL]`
#	DB: path to acs_08-16.db
#	DEST: output directory
#	MODEL (optional): ageonly or racedu (default = ageonly)

# Order of traits: husband then wife; age, edu, race
# Notation: husband gets _SP suffix, wife is default
# Min age with edu is 25 because of endogeneity of college
# ACS data: 2008-2016, ages 18-79 (note: AGE_SP is not limited).
#	Note: 2015 and 2016 are missing the RACESING variable


### Usage Options ###

# command line args
# Command line args (see usage at the top of the file): DB DEST [MODEL]
args <- commandArgs(trailingOnly = TRUE)

# DB and DEST are mandatory: fail fast with the usage string instead of
# building paths from NA (file.path(NA, ...) would quietly yield "NA/...").
if (length(args) < 2) {
	stop("Usage: Rscript smooth-data.r DB DEST [MODEL]", call. = FALSE)
}

db.file <- args[1] # filepath to acs_08-16.db
data.dir <- args[2] # path to dir where 'ageonly16' and 'racedu24' dirs will be
# MODEL is optional: when omitted, args[3] would be NA and the comparison below
# would abort, so fall back to the documented default here.
model <- if (length(args) >= 3) args[3] else "ageonly"

# use edu and race as types? (even if not, still keep redundant types for compat)
if (model == "ageonly") {
	age.only <- TRUE
} else if (model == "racedu") {
	age.only <- FALSE
} else {
	age.only <- TRUE
	warning(paste("Model", model, "not available, defaulting to 'ageonly'."))
}

# recursive = TRUE so a missing DEST parent is created up front instead of
# being discovered only when the CSVs are written at the very end
out.dir <- file.path(data.dir, if (age.only) "ageonly16" else "racedu24")
dir.create(path = out.dir, showWarnings = FALSE, recursive = TRUE)
message("Start processing '", if (age.only) "ageonly" else "racedu", "' model.")
message("Output directory: ", out.dir)

# number of ACS samples that are pooled
n.samples <- 9 # for acs_08-16 data

# boundaries: initial/terminal age
# NOTE(review): the original comment labels min.age "exclusive", yet the queries
# (AGE >= min.age) and grids (min.age:79) in this script include it -- presumably
# the downstream estimation drops it; confirm.
min.age <- if (age.only) 18 else 25 # exclusive
max.age <- 65 # inclusive

# choose what bandwidth to use for ages in marriages and for individuals
ind.bw.cv <- TRUE # set to TRUE to use bw=cv.aic
ind.bw <- 1.0 # manual bandwidth, only used when ind.bw.cv is FALSE

mar.bw.cv <- FALSE # set to TRUE to use bw=cv.aic, else use manual bw
# bandwidth matrices to use for manual bivariate age smoothing
#	* values tuned by manual validation
pop.bw.var <- if (age.only) 16 else 24 # bandwidth matrix variance (16 for age-only, 24 with race-edu)
pop.bw.cor <- 0.98 # bandwidth matrix correlation (diagonal orientation)
flow.bw.cor <- 0.9
mig.bw.cor <- 0.85

# 2x2 bandwidth matrix with common variance `v` and correlation `r`
bw.matrix <- function(v, r) matrix(v * c(1, r, r, 1), nrow = 2, ncol = 2)

pop.bw <- bw.matrix(pop.bw.var, pop.bw.cor)
flow.bw <- bw.matrix(pop.bw.var, flow.bw.cor)
mig.bw <- bw.matrix(pop.bw.var, mig.bw.cor)

# top 20 largest MSAs
top.msa <- c(35620, 31080, 16980, 19100, 37980, 26420, 47900, 33100, 12060, 14460, 41860, 19820, 38060, 40140, 42660, 33460, 41740, 45300, 41180, 12580)
# same codes as a SQL "in (...)" list; derived so the two cannot drift apart
top_msa <- paste0("(", paste(top.msa, collapse = ", "), ")")


suppressPackageStartupMessages(library(stringr)) # string interpolation
suppressPackageStartupMessages(library(DBI)) # RSQLite database functions
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(np)) # non-parametric regression

source("local-regression.r") # load local-polynomial regression function


# open the SQLite database (tables used below: acs, mig2met)
message("DB connection: ", db.file)
db <- dbConnect(RSQLite::SQLite(), db.file)


### Categorization ###

# code ranges/sets of the race variable that are mapped to category 1
range.white <- "100 and 130"
range.asian <- "400 and 699"
range.asian2 <- "860 and 892"
range.whasian <- "810 and 826"
range.whasian2 <- "910 and 925"
codes.extra <- "(943, 963)"

# Note: SQL uses waterfall logic for case-when -- it stops at the first true when condition
# One "between" clause per range variable, then the extra code set; anything else is 2.
range.vars <- c("range.white", "range.asian", "range.asian2", "range.whasian", "range.whasian2")
template_non_minority <- c("case",
						   paste0(" when ${race.var} between ${", range.vars, "} then 1"),
						   " when ${race.var} in ${codes.extra} then 1",
						   " else 2 end")

template_college <- "case when ${edu.var} >= 10 then 2 else 1 end"

# str_interp() fills ${...} from the variables in scope at call time, so the
# column variables are set right before each interpolation: own columns first...
race.var <- '"RACED"'
edu.var <- '"EDUC"'
case_minority <- str_interp(template_non_minority)
case_college <- str_interp(template_college)

# ...then the spouse (_SP) columns
race.var <- '"RACED_SP"'
edu.var <- '"EDUC_SP"'
case_minority_sp <- str_interp(template_non_minority)
case_college_sp <- str_interp(template_college)


# customize queries for desired types
# builds "<college> as COLLEGE<sfx>, <minority> as MINORITY<sfx>,"
type.select <- function(college.sql, minority.sql, sfx = "") {
	paste(college.sql, paste0("as COLLEGE", sfx, ","),
		  minority.sql, paste0("as MINORITY", sfx, ","))
}

if (age.only) {
	# constant placeholder types keep the output schema identical across models
	ind.types <- type.select("1", "1")
	husb.types <- type.select("1", "1", "_M")
	wife.types <- type.select("1", "1", "_F")
} else {
	ind.types <- type.select(case_college, case_minority)
	husb.types <- type.select(case_college_sp, case_minority_sp, "_M")
	wife.types <- type.select(case_college, case_minority, "_F")
}

# type column names used as merge keys...
ind.mrg <- c("COLLEGE", "MINORITY")
husb.mrg <- paste0(ind.mrg, "_M")
wife.mrg <- paste0(ind.mrg, "_F")

# ...and the same names as SQL "group by" suffixes
ind.grp <- paste0(", ", paste(ind.mrg, collapse = ", "))
husb.grp <- paste0(", ", paste(husb.mrg, collapse = ", "))
wife.grp <- paste0(", ", paste(wife.mrg, collapse = ", "))

# complete grids to merge in
type.levels <- if (age.only) 1 else 1:2
grid.ages <- min.age:79
ind.grid <- CJ(MSA = top.msa, SEX = 1:2, AGE = grid.ages,
			   COLLEGE = type.levels, MINORITY = type.levels)
mar.grid <- CJ(MSA = top.msa,
			   AGE_M = grid.ages, COLLEGE_M = type.levels, MINORITY_M = type.levels,
			   AGE_F = grid.ages, COLLEGE_F = type.levels, MINORITY_F = type.levels)


### Queries ###

# All queries below are assembled with paste() from the type-select snippets
# (ind.types / husb.types / wife.types), the group-by suffixes (ind.grp /
# husb.grp / wife.grp), min.age and the top_msa list. Weighted sums are divided
# by the number of pooled samples so the result is a per-sample average.

# total population counts (RAW_POP) by MSA, sex, age and type
# NOTE(review): "GQ" < 3 presumably restricts to households (no group quarters) -- confirm against the ACS codebook
qry_tot <- paste('select "MET2013" as MSA, "SEX" as SEX,
					  "AGE" as AGE,', ind.types,
					  'sum("PERWT")/', n.samples, 'as RAW_POP
				  from acs
				  where AGE >=', min.age, 'and MSA in', top_msa, 'and "GQ" < 3
				  group by MSA, SEX, AGE', ind.grp)

# singles counts (RAW_SNG): same as the total-population query plus "MARST" >= 3
# NOTE(review): presumably MARST codes 3 and above are the not-currently-married statuses -- confirm
qry_sng <- paste('select "MET2013" as MSA, "SEX" as SEX,
					  "AGE" as AGE,', ind.types,
					  'sum("PERWT")/', n.samples, 'as RAW_SNG
				  from acs
				  where AGE >=', min.age, 'and "MARST" >= 3 and MSA in', top_msa, 'and "GQ" < 3
				  group by MSA, SEX, AGE', ind.grp)

# married counts (RAW_MASS) by MSA and (husband, wife) type pair
# Rows are restricted to "SEX" = 2, so the respondent columns supply the wife (_F)
# traits and the _SP columns supply the husband (_M) traits; weights are HHWT.
qry_marr <- paste('select "MET2013" as MSA,
					   "AGE_SP" as AGE_M,', husb.types,
					   '"AGE" as AGE_F,', wife.types,
					   'sum("HHWT")/', n.samples, 'as RAW_MASS
				   from acs
				   where "MARST" <= 2 and "SEX" = 2
					   and AGE_F >=', min.age, 'and AGE_M >=', min.age,
					   'and MSA in', top_msa,
				   'group by MSA, AGE_M', husb.grp, ', AGE_F', wife.grp)

# MF(x,y): counts of new marriages by MSA and type pair
# Same shape as the married-counts query, additionally filtered on "MARRINYR" = 2
# NOTE(review): presumably MARRINYR = 2 means "married within the past year" -- confirm
qry_marr_flow <- paste('select "MET2013" as MSA,
							"AGE_SP" as AGE_M,', husb.types,
							'"AGE" as AGE_F,', wife.types,
							'sum("HHWT")/', n.samples, 'as RAW_MF
						from acs
						where "MARRINYR" = 2 and "MARST" <= 2
							and AGE_F >=', min.age, 'and AGE_M >=', min.age,
							'and MSA in', top_msa, 'and "SEX" = 2
						group by MSA, AGE_M', husb.grp,', AGE_F', wife.grp)

# MF(x), MF(y)
#qry_mar_flow <- paste0('select "MET2013" as MSA, "SEX" as SEX,
#	"AGE" as AGE, ', case_college, ' as COLLEGE,', case_minority, ' as MINORITY,
#	sum("PERWT") as MF
#	from acs
#	where "MARRINYR" = 2 and AGE >= ', min.age, ' and MSA in ', top_msa,
#	'group by MSA, SEX, AGE, COLLEGE, MINORITY')

# DF(x), DF(y): divorce flow counts (RAW_DF) by MSA, sex, age and type
# NOTE(review): unlike the population queries there is no "GQ" filter here -- confirm this is intended
qry_div_flow <- paste('select "MET2013" as MSA, "SEX" as SEX,
						   "AGE" as AGE,', ind.types,
						   'sum("PERWT")/', n.samples, 'as RAW_DF
					   from acs
					   where "DIVINYR" = 2 and AGE >=', min.age, 'and MSA in', top_msa,
					   'group by MSA, SEX, AGE', ind.grp)

# migration inflow: counts of people who weren't in current MSA last year (only defined starting in 2012)
# mig2met maps (MIGPUMA1, MIGPLAC1) to last year's MSA. The divisor is
# n.samples - 4 because only samples with "YEAR" >= 2012 are used (5 of the 9 pooled years).
# NOTE(review): "MIGRATE1D" >= 24 presumably selects people who changed residence -- confirm
qry_inflow <- paste('select acs."MET2013" as MSA, acs."SEX" as SEX,
						 acs."AGE" as AGE,', ind.types,
						 'sum(acs."PERWT")/', n.samples - 4, 'as INFLOW_RAW
                     from acs
                     left join mig2met as m on acs."MIGPUMA1"=m."MIGPUMA1" and acs."MIGPLAC1"=m."MIGPLAC1"
                     where acs."AGE" >=', min.age, 'and acs."MET2013" in', top_msa, 'and m."MSA" != acs."MET2013"
                         and acs."MIGRATE1D" >= 24 and acs."GQ" < 3 and acs."YEAR" >= 2012
                     group by acs."MET2013", acs."SEX", acs."AGE"', ind.grp)

# migration outflow: counts of people who were in MSA last year but no longer (only defined starting in 2012)
# Mirror image of the inflow query: grouped by last year's MSA (m."MSA") instead of the current one.
qry_outflow <- paste('select m."MSA" as MSA, acs."SEX" as SEX,
						  acs."AGE" as AGE,', ind.types,
						  'sum(acs."PERWT")/', n.samples - 4, 'as OUTFLOW_RAW
                      from acs
                      left join mig2met as m on acs."MIGPUMA1"=m."MIGPUMA1" and acs."MIGPLAC1"=m."MIGPLAC1"
                      where acs."AGE" >=', min.age, 'and m."MSA" in', top_msa, 'and m."MSA" != acs."MET2013"
						  and acs."MIGRATE1D" >= 24 and acs."GQ" < 3 and acs."YEAR" >= 2012
                      group by m."MSA", acs."SEX", acs."AGE"', ind.grp)

# couple migration inflows (only defined starting in 2012)
# Couple-level version of the inflow query: wife rows ("SEX" = 2), HHWT weights,
# grouped by current MSA and the (husband, wife) type pair.
qry_inmar <- paste('select acs."MET2013" as MSA,
						acs."AGE_SP" as AGE_M, ', husb.types,
						'acs."AGE" as AGE_F, ', wife.types,
						'sum(acs."HHWT")/', n.samples - 4, 'as INFLOW_RAW
                     from acs
                     left join mig2met as m on acs."MIGPUMA1"=m."MIGPUMA1" and acs."MIGPLAC1"=m."MIGPLAC1"
                     where acs."MET2013" in', top_msa, 'and m."MSA" != acs."MET2013"
                        and "MARST" <= 2 and "SEX" = 2
                        and AGE_F >=', min.age, 'and AGE_M >=', min.age,
						'and acs."MIGRATE1D" >= 24 and acs."YEAR" >= 2012
                     group by acs."MET2013", acs."AGE_SP"', husb.grp,', acs."AGE"', wife.grp)

# couple migration outflows (only defined starting in 2012)
# Couple-level version of the outflow query: grouped by last year's MSA (m."MSA").
qry_outmar <- paste('select m."MSA" as MSA,
						acs."AGE_SP" as AGE_M,', husb.types,
						'acs."AGE" as AGE_F,', wife.types,
						'sum(acs."HHWT")/', n.samples - 4, 'as OUTFLOW_RAW
                     from acs
                     left join mig2met as m on acs."MIGPUMA1"=m."MIGPUMA1" and acs."MIGPLAC1"=m."MIGPLAC1"
                     where m."MSA" in', top_msa, 'and m."MSA" != acs."MET2013"
                        and "MARST" <= 2 and "SEX" = 2
                        and AGE_F >=', min.age, 'and AGE_M >=', min.age,
                        'and acs."MIGRATE1D" >= 24 and acs."YEAR" >= 2012
                     group by m."MSA", acs."AGE_SP"', husb.grp,', acs."AGE"', wife.grp)


# queries return dataframes, convert to data.table and merge into the complete grid
run.query <- function(qry) data.table(dbGetQuery(db, qry))

# merge keys: one row per individual cell / per couple cell
ind.key <- c("MSA", "SEX", "AGE", ind.mrg)
mar.key <- c("MSA", "AGE_M", husb.mrg, "AGE_F", wife.mrg)

# Individual tables: full outer join of the raw query results, then a right
# join onto ind.grid so every grid cell is present (missing cells become NA).
message("Query: populations...")
pop.dt <- merge(merge(run.query(qry_tot), run.query(qry_sng),
					  all = TRUE, by = ind.key),
				ind.grid,
				all.y = TRUE, by = ind.key)

# right join to force to grid (drops husbands > 79)
message("Query: marriage stocks...")
marriages <- merge(run.query(qry_marr), mar.grid,
				   all.y = TRUE, by = mar.key)

message("Query: marriage flows...")
mar.flow <- merge(run.query(qry_marr_flow), mar.grid,
				  all.y = TRUE, by = mar.key)
message("Query: divorce flows...")
div.flow <- merge(run.query(qry_div_flow), ind.grid,
				  all.y = TRUE, by = ind.key)

message("Query: individual migration outflows...")
ind.mig <- merge(merge(run.query(qry_inflow), run.query(qry_outflow),
					   all = TRUE, by = ind.key),
				 ind.grid,
				 all.y = TRUE, by = ind.key)

message("Query: marriage migration outflows...")
mar.mig <- merge(merge(run.query(qry_inmar), run.query(qry_outmar),
					   all = TRUE, by = mar.key),
				 mar.grid,
				 all.y = TRUE, by = mar.key)

# Every query has run and nothing below touches the database again, so release
# the SQLite connection instead of leaving it open through the smoothing stage.
dbDisconnect(db)

# fill NA with zeros before smoothing
pop.dt[is.na(pop.dt)] <- 0
marriages[is.na(RAW_MASS), RAW_MASS := 0]
mar.flow[is.na(RAW_MF), RAW_MF := 0]
div.flow[is.na(RAW_DF), RAW_DF := 0]
ind.mig[is.na(ind.mig)] <- 0
mar.mig[is.na(mar.mig)] <- 0

# net migration outflow = outflow - inflow (can be negative)
ind.mig[, NET_OUTFLOW_RAW := OUTFLOW_RAW - INFLOW_RAW]
mar.mig[, NET_OUTFLOW_RAW := OUTFLOW_RAW - INFLOW_RAW]


##### Smoothing by Non-parametric Regression #####

# npregbw cv methods give following bandwidths on age per msa
#	* wom.sng: 2.2
#	* marriages (cv.aic: split sample): (3 hour runtime)
#		* alpha comes out way too bumpy with cv.aic, because of the first differencing
#		* NYC: (0.78,0.77); Chi-town: (0.764,0.75); SF: (0.8,0.8);
#		* playing with manual bw: 1.5 still oddly lumpy
#	* marriages (product kernel): takes >24h for 1/5 starts... aborted

# convert categorical variables to factors for npreg
# set() works by reference, so looping over a list of the tables updates the
# original objects in place: individual-level tables first, then couple-level.
for (tbl in list(pop.dt, div.flow, ind.mig)) {
	for (col in ind.mrg) {
		set(tbl, j = col, value = factor(tbl[[col]]))
	}
}
for (tbl in list(marriages, mar.flow, mar.mig)) {
	for (col in c(husb.mrg, wife.mrg)) {
		set(tbl, j = col, value = factor(tbl[[col]]))
	}
}


### Individuals ###

# Smooth the raw individual-level series over AGE, separately within each
# (MSA, SEX, type) cell, adding the smoothed columns by reference:
#	pop.dt:   SNG (from RAW_SNG) and POP (from RAW_POP)
#	div.flow: FLOW (from RAW_DF)
#	ind.mig:  NET_OUTFLOW (from NET_OUTFLOW_RAW)
# Both branches call np's npreg with regtype = "ll"; predict() without newdata
# returns the fitted values at the observed ages.
message("Starting non-parametric regression on individual masses.")

# ind.bw.cv is set in the options section: TRUE selects the bandwidth per cell
# via npregbw(bwmethod = "cv.aic"), FALSE uses the fixed manual bandwidth ind.bw
if (ind.bw.cv) {
	# cross-validated bandwidth with sample splitting on categoricals
	pop.dt[, `:=`(SNG = predict(npreg(bws = npregbw(formula = RAW_SNG ~ AGE,
												  regtype = "ll",
												  bwmethod = "cv.aic",
												  data = .SD))),
			      POP = predict(npreg(bws = npregbw(formula = RAW_POP ~ AGE,
												  regtype = "ll",
												  bwmethod = "cv.aic",
												  data = .SD)))),
			by = c("MSA", "SEX", ind.mrg)]

	div.flow[, FLOW := predict(npreg(bws = npregbw(formula = RAW_DF ~ AGE,
												 regtype = "ll",
												 bwmethod = "cv.aic",
												 data = .SD))),
			  by = c("MSA", "SEX", ind.mrg)]

	# Note: untested
	ind.mig[, NET_OUTFLOW := predict(npreg(bws=npregbw(formula = NET_OUTFLOW_RAW ~ AGE,
													   regtype="ll",
													   bwmethod="cv.aic",
													   data=.SD))),
			 by = c("MSA", "SEX", ind.mrg)]
} else {
	# manual smoothing with sample splitting
	# (same three regressions, but with the fixed bandwidth ind.bw passed
	# through npreg's txdat/tydat interface instead of a formula)
	pop.dt[, `:=`(SNG = predict(npreg(bws=ind.bw, txdat=AGE, tydat=RAW_SNG, regtype="ll")),
				  POP = predict(npreg(bws=ind.bw, txdat=AGE, tydat=RAW_POP, regtype="ll"))),
			by = c("MSA", "SEX", ind.mrg)] # typically use list for `by`, but need vector here to join

	div.flow[, FLOW := predict(npreg(bws=ind.bw, txdat=AGE, tydat=RAW_DF, regtype="ll")),
			   by = c("MSA", "SEX", ind.mrg)]

	ind.mig[, NET_OUTFLOW := predict(npreg(bws=ind.bw, txdat=AGE, tydat=NET_OUTFLOW_RAW, regtype="ll")),
			by = c("MSA", "SEX", ind.mrg)]
}

## combine groups with age >= max.age (truncation at T)

# grouping cell (everything but age) and full row key for individuals
ind.cell <- c("MSA", "SEX", ind.mrg)
ind.row <- c("MSA", "SEX", "AGE", ind.mrg)

# collapse every age >= max.age into a single max.age row per cell:
# smoothed stocks...
trm.pop <- pop.dt[AGE >= max.age,
				   .(AGE = max.age, SNG = sum(SNG), POP = sum(POP)),
				   by = ind.cell]
# ...and smoothed net migration outflows
trm.ind.mig <- ind.mig[AGE >= max.age,
					   .(AGE = max.age, NET_OUTFLOW = sum(NET_OUTFLOW)),
					   by = ind.cell]

# keep the rows below max.age and append the lumped rows; merging on all
# shared columns means the raw columns are simply NA on the lumped rows
pop.dt <- merge(pop.dt[AGE < max.age], trm.pop,
				 all = TRUE, by = c(ind.row, "SNG", "POP"))
ind.mig <- merge(ind.mig[AGE < max.age], trm.ind.mig,
				 all = TRUE, by = c(ind.row, "NET_OUTFLOW"))


### Marriages ###

# Smooth the raw couple-level series over the (AGE_M, AGE_F) plane, separately
# within each (MSA, husband type, wife type) cell, adding by reference:
#	marriages: MASS (from RAW_MASS)
#	mar.flow:  FLOW (from RAW_MF)
#	mar.mig:   NET_OUTFLOW (from NET_OUTFLOW_RAW)
message("Starting non-parametric regression on couple masses.")

# mar.bw.cv is set in the options section: TRUE uses np with a cv.aic bandwidth
# (slow), FALSE uses loc.poly.reg with the manual bandwidth matrices
if (mar.bw.cv) {
	# cross-validated bandwidth with sample splitting
	message("CV bandwidth will take a while...")

	marriages[, MASS := predict(npreg(bws=npregbw(formula = RAW_MASS ~ AGE_M + AGE_F,
												  regtype="ll",
												  bwmethod="cv.aic",
												  data=.SD))),
				by = c("MSA", husb.mrg, wife.mrg)]

	mar.flow[, FLOW := predict(npreg(bws=npregbw(formula = RAW_MF ~ AGE_M + AGE_F,
												 regtype="ll",
												 bwmethod="cv.aic",
												 data=.SD))),
			   by = c("MSA", husb.mrg, wife.mrg)]

	mar.mig[, NET_OUTFLOW := predict(npreg(bws=npregbw(formula = NET_OUTFLOW_RAW ~ AGE_M + AGE_F,
													   regtype="ll",
													   bwmethod="cv.aic",
													   data=.SD))),
			 by = c("MSA", husb.mrg, wife.mrg)]
} else {
	# manual age-only smoothing with kernel oriented along joint aging axis
	# loc.poly.reg comes from the sourced local-regression.r; each series gets
	# its own bandwidth matrix (pop.bw / flow.bw / mig.bw).
	# NOTE(review): order = 3 is presumably the polynomial degree (local-cubic,
	# per the file header) -- confirm in local-regression.r
	message("Smoothing marriage stocks...")
	marriages[, MASS := loc.poly.reg(AGE_M, AGE_F, RAW_MASS, pop.bw, order = 3),
			  by = c("MSA", husb.mrg, wife.mrg)]

	message("Smoothing marriage flows...")
	mar.flow[, FLOW := loc.poly.reg(AGE_M, AGE_F, RAW_MF, flow.bw, order = 3),
			 by = c("MSA", husb.mrg, wife.mrg)]

	message("Smoothing marriage migration outflows...")
	mar.mig[, NET_OUTFLOW := loc.poly.reg(AGE_M, AGE_F, NET_OUTFLOW_RAW, mig.bw, order = 3),
			by = c("MSA", husb.mrg, wife.mrg)]
}

# age-only smoothing
#marriages[, MASS := predict(npreg(bws=c(pop.bw, pop.bw),
#								  txdat=.(AGE_M, AGE_F), tydat=RAW_MASS,
#								  regtype="ll")),
#		   by = c("MSA", husb.mrg, wife.mrg)]
# full product kernel on all 6 variables
#marriages[, MASS := predict(npreg(bws=npregbw(formula = MARRIAGES ~ AGE_M + COLLEGE_M + MINORITY_M + AGE_F + COLLEGE_F + MINORITY_F,
#											  regtype="ll",
#											  bwmethod="cv.aic",
#											  data=.SD)),
#							newdata=.SD),
#          by = MSA]


### Trim, Truncate, and Clean ###

message("Tidying up and saving csv files...")

# grouping/key column sets for couple-level tables
couple.row <- c("MSA", "AGE_M", husb.mrg, "AGE_F", wife.mrg) # full row key
couple.cell <- c("MSA", husb.mrg, wife.mrg)                   # types only
per.wife.age <- c("MSA", husb.mrg, "AGE_F", wife.mrg)         # types + wife age
per.husb.age <- c("MSA", "AGE_M", husb.mrg, wife.mrg)         # types + husband age

## Marriage stocks: lump ages >= max.age into the max.age row/column

# both >= max.age
trm.both.mar <- marriages[AGE_M >= max.age & AGE_F >= max.age,
						  .(AGE_M = max.age, AGE_F = max.age, MASS = sum(MASS)),
						  by = couple.cell]
# husband >= max.age, wife < max.age: group within each wife age
trm.husb.mar <- marriages[AGE_M >= max.age & AGE_F < max.age,
						  .(AGE_M = max.age, MASS = sum(MASS)),
						  by = per.wife.age]
# husband < max.age, wife >= max.age: group within each husband age
trm.wife.mar <- marriages[AGE_M < max.age & AGE_F >= max.age,
						  .(AGE_F = max.age, MASS = sum(MASS)),
						  by = per.husb.age]

## Couple migration: same three segments for NET_OUTFLOW

# both >= max.age
trm.both.mig <- mar.mig[AGE_M >= max.age & AGE_F >= max.age,
						.(AGE_M = max.age, AGE_F = max.age, NET_OUTFLOW = sum(NET_OUTFLOW)),
						by = couple.cell]
# husband >= max.age, wife < max.age: group within each wife age
trm.husb.mig <- mar.mig[AGE_M >= max.age & AGE_F < max.age,
						.(AGE_M = max.age, NET_OUTFLOW = sum(NET_OUTFLOW)),
						by = per.wife.age]
# husband < max.age, wife >= max.age: group within each husband age
trm.wife.mig <- mar.mig[AGE_M < max.age & AGE_F >= max.age,
						.(AGE_F = max.age, NET_OUTFLOW = sum(NET_OUTFLOW)),
						by = per.husb.age]

# drop and merge each segment: start from the interior (both ages < max.age)
# and append the lumped segments one at a time, in the order both/husband/wife
marriages <- marriages[AGE_M < max.age & AGE_F < max.age]
for (segment in list(trm.both.mar, trm.husb.mar, trm.wife.mar)) {
	marriages <- merge(marriages, segment, all = TRUE,
					   by = c(couple.row, "MASS"))
}

mar.mig <- mar.mig[AGE_M < max.age & AGE_F < max.age]
for (segment in list(trm.both.mig, trm.husb.mig, trm.wife.mig)) {
	mar.mig <- merge(mar.mig, segment, all = TRUE,
					 by = c(couple.row, "NET_OUTFLOW"))
}

# trim max.age (NOTE: max.age will be dropped in the estimation because of truncation)
# flows are only cut off above max.age, not lumped like the stocks
div.flow <- div.flow[AGE <= max.age]
mar.flow <- mar.flow[AGE_M <= max.age & AGE_F <= max.age]

# local regression may give negative or tiny values: truncate to min of 0
div.flow[FLOW < 0, FLOW := 0.0]
mar.flow[FLOW < 0, FLOW := 0.0]
marriages[MASS < 0, MASS := 0.0]


### Save flows and stocks ###

# output file name -> table; written to out.dir in this order
csv.outputs <- list("marriages.csv" = marriages,
					"pop.csv" = pop.dt,
					"pair-MF.csv" = mar.flow,
					"ind-DF.csv" = div.flow,
					"indiv-migration.csv" = ind.mig,
					"mar-migration.csv" = mar.mig)

for (csv.name in names(csv.outputs)) {
	fwrite(csv.outputs[[csv.name]], file = file.path(out.dir, csv.name))
}

message("Done!")