|
| 1 | +# Generate example datasets as compressed CSV files |
| 2 | + |
| 3 | +# See data/README.md for the sources of the input data files |
| 4 | +# To regenerate the .csv.gz files: |
| 5 | +# 1) Have all input files ready in the data folder |
| 6 | +# 2) Instantiate the package environment for data/src |
| 7 | +# 3) Run this script and call `make()` with the root folder as working directory |
| 8 | + |
| 9 | +using CSV, CodecBzip2, CodecZlib, DataFrames, DataValues, RData, ReadStat |
| 10 | + |
"""
    _to_array(d::DataValueArray{T}) where T

Copy the contents of `d` into a plain `Array` of the same size.

The result has element type `T` when every element of `d` holds a value.
As soon as an element without a value is encountered, the result is widened to
`Union{T,Missing}` and that element (and any later one without a value) is
stored as `missing`.
"""
function _to_array(d::DataValueArray{T}) where T
    out = Array{T}(undef, size(d))
    widened = false
    @inbounds for idx in eachindex(d)
        elem = d[idx]
        if hasvalue(elem)
            out[idx] = elem.value
            continue
        end
        # Widen the element type only once, on the first element without a value
        if !widened
            out = convert(Array{Union{T,Missing}}, out)
            widened = true
        end
        out[idx] = missing
    end
    return out
end
| 28 | + |
"""
    _get_columns(data::ReadStatDataFrame, names::Vector{Symbol})

Return the columns of `data` whose headers are listed in `names`, in the order
given by `names`, each converted with `_to_array`.

An entry of `names` that is not among `data.headers` raises a `KeyError`.
"""
function _get_columns(data::ReadStatDataFrame, names::Vector{Symbol})
    # Map each header to its position in `data.headers`
    position = Dict(zip(data.headers, keys(data.headers)))
    return AbstractVector[_to_array(data.data[position[n]]) for n in names]
end
| 38 | + |
| 39 | +# The steps for preparing data follow Sun and Abraham (2020) |
"""
    hrs()

Build the HRS example dataset from `data/HRS_long.dta` and write it to
`data/hrs.csv.gz`.

Processing steps, in order:
1. Read the selected columns and drop rows with missing `wave`, `age_hosp`
   or `evt_time`.
2. Keep rows with `wave >= 7` and `age_hosp <= 59`.
3. Within each `hhidpn`, keep only groups that have exactly five remaining rows
   and whose minimum `evt_time` is negative.
4. Replace `wave_hosp` with its within-`hhidpn` minimum, ignoring missing values.
5. Recode `male`, `spouse`, `white`, `black` and `hispanic` to 1 where the raw
   value equals 100 and 0 otherwise.
6. Convert every column except `oop_spend`, `riearnsemp` and `rwthh` to `Int`.
7. Replace `hhidpn` with consecutive integers in order of first appearance.
"""
function hrs()
    raw = read_dta("data/HRS_long.dta")
    colnames = [:hhidpn, :wave, :wave_hosp, :evt_time, :oop_spend, :riearnsemp, :rwthh,
        :male, :spouse, :white, :black, :hispanic, :age_hosp]
    cols = _get_columns(raw, colnames)
    df = dropmissing!(DataFrame(cols, colnames), [:wave, :age_hosp, :evt_time])
    df = df[(df.wave.>=7).&(df.age_hosp.<=59), :]
    # Must count wave after the above selection
    transform!(groupby(df, :hhidpn), nrow=>:nwave, :evt_time => minimum => :evt_time)
    df = df[(df.nwave.==5).&(df.evt_time.<0), :]
    transform!(groupby(df, :hhidpn), :wave_hosp => minimum∘skipmissing => :wave_hosp)
    select!(df, Not([:nwave, :evt_time, :age_hosp]))
    for n in (:male, :spouse, :white, :black, :hispanic)
        df[!, n] .= ifelse.(df[!, n].==100, 1, 0)
    end
    # These columns are left with their original element type; the names must
    # match the entries of `colnames` exactly for the exclusion to take effect
    noninteger = (:oop_spend, :riearnsemp, :rwthh)
    for n in propertynames(df)
        if !(n in noninteger)
            df[!, n] .= convert(Array{Int}, df[!, n])
        end
    end
    # Replace the original hh index with enumeration
    ids = Dict{Int,Int}()
    hhidpn = df.hhidpn
    for i in eachindex(hhidpn)
        # An id seen for the first time gets the next unused integer
        hhidpn[i] = get!(() -> length(ids) + 1, ids, hhidpn[i])
    end
    open(GzipCompressorStream, "data/hrs.csv.gz", "w") do stream
        CSV.write(stream, df)
    end
end
| 79 | + |
| 80 | +# Produce a subset of nsw_long from the DRDID R package |
"""
    nsw()

Build the NSW example dataset from the tab-delimited file `data/ec675_nsw.tab`
and write it to `data/nsw.csv.gz`.

Rows are kept when `treated` equals 0 or `sample` equals 2. The indicator
`experimental` is 0 where `treated` is missing and 1 otherwise. Each remaining
row receives a sequential `id`, and `re75`/`re78` are stacked into a single
`re` column with `year` set to 1975 or 1978. The output is sorted by `id`.
"""
function nsw()
    wide = DataFrame(CSV.File("data/ec675_nsw.tab", delim='\t'))
    keep = (isequal.(wide.treated, 0)) .| (wide.sample .== 2)
    wide = wide[keep, Not([:dwincl, :early_ra])]
    wide.experimental = ifelse.(ismissing.(wide.treated), 0, 1)
    select!(wide, Not([:treated, :sample]))
    wide.id = 1:nrow(wide)
    # Convert the data to long format
    long = stack(wide, [:re75, :re78])
    long.year = ifelse.(long.variable .== "re75", 1975, 1978)
    select!(long, Not(:variable))
    rename!(long, :value => :re)
    sort!(long, :id)
    open(GzipCompressorStream, "data/nsw.csv.gz", "w") do io
        CSV.write(io, long)
    end
end
| 97 | + |
| 98 | +# Convert mpdta from the did R package to csv format |
"""
    mpdta()

Load the `mpdta` object from `data/mpdta.rda`, convert its `first_treat` column
to `Vector{Int}`, drop the `treat` column and write the result to
`data/mpdta.csv.gz`.
"""
function mpdta()
    objects = load("data/mpdta.rda")
    tbl = objects["mpdta"]
    tbl.first_treat = convert(Vector{Int}, tbl.first_treat)
    select!(tbl, Not(:treat))
    open(GzipCompressorStream, "data/mpdta.csv.gz", "w") do io
        CSV.write(io, tbl)
    end
end
| 107 | + |
"""
    make()

Regenerate all example datasets by calling `hrs`, `nsw` and `mpdta` in turn.
The value returned is that of the final `mpdta()` call.
"""
function make()
    hrs()
    nsw()
    return mpdta()
end
0 commit comments