JuliaDiffinDiffs
diff --git a/‎Project.toml‎
Lines changed: 2 additions & 0 deletions b/‎Project.toml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎data/README.md‎
Lines changed: 13 additions & 6 deletions b/‎data/README.md‎
Lines changed: 13 additions & 6 deletions
diff --git a/‎data/hrs.csv‎
Lines changed: 0 additions & 3281 deletions b/‎data/hrs.csv‎
Lines changed: 0 additions & 3281 deletions
diff --git a/‎data/hrs.csv.gz‎
51.7 KB b/‎data/hrs.csv.gz‎
51.7 KB
diff --git a/‎data/mpdta.csv.gz‎
29.4 KB b/‎data/mpdta.csv.gz‎
29.4 KB
diff --git a/‎data/nsw.csv.gz‎
299 KB b/‎data/nsw.csv.gz‎
299 KB
diff --git a/‎data/src/Project.toml‎
Lines changed: 20 additions & 0 deletions b/‎data/src/Project.toml‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎data/src/make.jl‎
Lines changed: 112 additions & 0 deletions b/‎data/src/make.jl‎
Lines changed: 112 additions & 0 deletions
diff --git a/‎data/src/make.py‎
Lines changed: 0 additions & 35 deletions b/‎data/src/make.py‎
Lines changed: 0 additions & 35 deletions
diff --git a/‎src/DiffinDiffsBase.jl‎
Lines changed: 2 additions & 1 deletion b/‎src/DiffinDiffsBase.jl‎
Lines changed: 2 additions & 1 deletion
@@ -5,6 +5,7 @@ version = "0.2.1"
 
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
@@ -17,6 +18,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
 CSV = "0.8"
+CodecZlib = "0.7"
 Combinatorics = "1"
 DataAPI = "1.6"
 DataFrames = "0.22"
 
@@ -2,23 +2,30 @@
 
 A collection of data files is provided here for ease of testing and illustration.
 The included data are modified from the original sources
-and stored in `.csv` files.
-See [`make.py`](src/make.py) for the source code
-that generates these files from the original data.
+and stored in compressed CSV (`.csv.gz`) files.
+See [`data/src/make.jl`](src/make.jl) for the source code
+that generates these files from original data.
 
 [DiffinDiffsBase.jl](https://github.com/JuliaDiffinDiffs/DiffinDiffsBase.jl)
 provides methods for looking up and loading these example data.
 Call `exampledata()` for a name list of the available datasets.
-To load one of them into a `DataFrame`, use the method `exampledata(name)`.
+To load one of them, call `exampledata(name)`
+where `name` is a `Symbol` matching the filename without its extension (e.g., `:hrs`).
 
 ## Sources and Licenses
 
 | Name | Source | File Link | License | Note |
 | :--- | :----: | :-------: | :-----: | :--- |
-| hrs | [Dobkin et al. (2018)](#DobkinFK18E) | [HRS_long.dta](https://doi.org/10.3886/E116186V1-73160) | [CC BY 4.0](https://doi.org/10.3886/E116186V1-73120) | Data are processed as in [Sun and Abraham (2020)](#SunA20) |
+| hrs | [Dobkin et al. (2018)](https://doi.org/10.1257/aer.20161038) | [HRS_long.dta](https://doi.org/10.3886/E116186V1-73160) | [CC BY 4.0](https://doi.org/10.3886/E116186V1-73120) | Data are processed as in [Sun and Abraham (2020)](https://doi.org/10.1016/j.jeconom.2020.09.006) |
+| nsw | [Diamond and Sekhon (2013)](https://doi.org/10.1162/REST_a_00318) | [ec675_nsw.tab](https://doi.org/10.7910/DVN/23407/DYEWLO) | [CC0 1.0](https://dataverse.org/best-practices/harvard-dataverse-general-terms-use) | Data are rearranged in a long format as in the R package [DRDID](https://github.com/pedrohcgs/DRDID/blob/master/data-raw/nsw.R) |
+| mpdta | [Callaway and Sant'Anna (2020)](https://doi.org/10.1016/j.jeconom.2020.12.001) | [mpdta.rda](https://github.com/bcallaway11/did/blob/master/data/mpdta.rda) | [GPL-2](https://cran.r-project.org/web/licenses/GPL-2) | |
 
 ## References
 
-<a name="DobkinFK18E">**Dobkin, Carlos, Finkelstein, Amy, Kluender, Raymond, and Notowidigdo, Matthew J.** 2018. "Replication data for: The Economic Consequences of Hospital Admissions." *American Economic Association* [publisher], Inter-university Consortium for Political and Social Research [distributor]. https://doi.org/10.3886/E116186V1.</a>
+<a name="CallawayS20">**Callaway, Brantly, and Pedro H. C. Sant'Anna.** 2020. "Difference-in-Differences with Multiple Time Periods." *Journal of Econometrics*, forthcoming.</a>
+
+<a name="DiamondS13G">**Diamond, Alexis, and Jasjeet S. Sekhon.** 2013. "Replication data for: Genetic Matching for Estimating Causal Effects: A General Multivariate Matching Method for Achieving Balance in Observational Studies." *MIT Press* [publisher], Harvard Dataverse [distributor]. https://doi.org/10.7910/DVN/23407/DYEWLO.</a>
+
+<a name="DobkinFK18E">**Dobkin, Carlos, Amy Finkelstein, Raymond Kluender, and Matthew J. Notowidigdo.** 2018. "Replication data for: The Economic Consequences of Hospital Admissions." *American Economic Association* [publisher], Inter-university Consortium for Political and Social Research [distributor]. https://doi.org/10.3886/E116186V1.</a>
 
 <a name="SunA20">**Sun, Liyang, and Sarah Abraham.** 2020. "Estimating Dynamic Treatment Effects in Event Studies with Heterogeneous Treatment Effects." *Journal of Econometrics*, forthcoming.</a>
@@ -0,0 +1,20 @@
+[deps]
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
+CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+DataValues = "e7dc6d0d-1eca-5fa6-8ad6-5aecde8b7ea5"
+FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
+RData = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
+ReadStat = "d71aba96-b539-5138-91ee-935c3ee1374c"
+
+[compat]
+CSV = "0.8"
+CodecBzip2 = "0.7"
+CodecZlib = "0.7"
+DataFrames = "0.22"
+DataValues = "0.4"
+FileIO = "< 1.6"
+RData = "0.7"
+ReadStat = "1"
+julia = "1.3"
@@ -0,0 +1,112 @@
+# Generate example datasets as compressed CSV files
+
+# See data/README.md for the sources of the input data files
+# To regenerate the .csv.gz files:
+# 1) Have all input files ready in the data folder
+# 2) Instantiate the package environment for data/src
+# 3) Run this script and call `make()` with the root folder as working directory
+
+using CSV, CodecBzip2, CodecZlib, DataFrames, DataValues, RData, ReadStat
+
+# Copy a `DataValueArray{T}` into a plain `Array`.
+# The result is an `Array{T}` when every element has a value; on the first
+# element without a value the buffer is widened once to
+# `Array{Union{T,Missing}}` and such elements are stored as `missing`.
+function _to_array(d::DataValueArray{T}) where T
+    out = Array{T}(undef, size(d))
+    widened = false
+    @inbounds for i in eachindex(d)
+        elem = d[i]
+        if hasvalue(elem)
+            out[i] = elem.value
+            continue
+        end
+        # First element without a value: switch to an array that can hold `missing`
+        if !widened
+            out = convert(Array{Union{T,Missing}}, out)
+            widened = true
+        end
+        out[i] = missing
+    end
+    return out
+end
+
+# Extract the columns listed in `names` from `data`, in that order,
+# each converted with `_to_array`.
+# Throws a `KeyError` if a requested name is not among `data.headers`.
+function _get_columns(data::ReadStatDataFrame, names::Vector{Symbol})
+    # Map each header to its position in `data.headers`
+    position = Dict(zip(data.headers, keys(data.headers)))
+    return AbstractVector[_to_array(data.data[position[n]]) for n in names]
+end
+
+# Generate data/hrs.csv.gz from data/HRS_long.dta
+# The steps for preparing data follow Sun and Abraham (2020)
+# File paths are relative to the working directory
+function hrs()
+    raw = read_dta("data/HRS_long.dta")
+    names = [:hhidpn, :wave, :wave_hosp, :evt_time, :oop_spend, :riearnsemp, :rwthh,
+        :male, :spouse, :white, :black, :hispanic, :age_hosp]
+    cols = _get_columns(raw, names)
+    df = dropmissing!(DataFrame(cols, names), [:wave, :age_hosp, :evt_time])
+    df = df[(df.wave.>=7).&(df.age_hosp.<=59), :]
+    # Must count wave after the above selection
+    transform!(groupby(df, :hhidpn), nrow=>:nwave, :evt_time => minimum => :evt_time)
+    df = df[(df.nwave.==5).&(df.evt_time.<0), :]
+    transform!(groupby(df, :hhidpn), :wave_hosp => minimum∘skipmissing => :wave_hosp)
+    select!(df, Not([:nwave, :evt_time, :age_hosp]))
+    # Recode the indicator columns from 100/other to 1/0
+    for n in (:male, :spouse, :white, :black, :hispanic)
+        df[!, n] .= ifelse.(df[!, n].==100, 1, 0)
+    end
+    # Convert every column to Int except the ones listed here.
+    # The list must match the names loaded above: the column is `rwthh`
+    # (it was previously misspelled `wrthh`, so the exclusion never applied).
+    keeptype = (:oop_spend, :riearnsemp, :rwthh)
+    for n in propertynames(df)
+        n in keeptype && continue
+        df[!, n] .= convert(Array{Int}, df[!, n])
+    end
+    # Replace the original hh index with enumeration (1, 2, ... in order of
+    # first appearance); `df.hhidpn` is modified in place
+    ids = IdDict{Int,Int}()
+    hhidpn = df.hhidpn
+    for i in eachindex(hhidpn)
+        # An unseen id is assigned the next integer; a seen id reuses its number
+        hhidpn[i] = get!(ids, hhidpn[i], length(ids) + 1)
+    end
+    open(GzipCompressorStream, "data/hrs.csv.gz", "w") do stream
+        CSV.write(stream, df)
+    end
+end
+
+# Generate data/nsw.csv.gz from data/ec675_nsw.tab
+# Produce a subset of nsw_long from the DRDID R package
+function nsw()
+    raw = DataFrame(CSV.File("data/ec675_nsw.tab", delim='\t'))
+    # Keep rows with treated equal to 0 (missing counts as not equal) or sample equal to 2
+    keep = isequal.(raw.treated, 0) .| (raw.sample .== 2)
+    wide = raw[keep, Not([:dwincl, :early_ra])]
+    # experimental is 0 where treated is missing and 1 otherwise
+    wide.experimental = ifelse.(ismissing.(wide.treated), 0, 1)
+    select!(wide, Not([:treated, :sample]))
+    wide.id = 1:nrow(wide)
+    # Convert the data to long format
+    long = stack(wide, [:re75, :re78])
+    long.year = ifelse.(long.variable .== "re75", 1975, 1978)
+    select!(long, Not(:variable))
+    rename!(long, :value => :re)
+    sort!(long, :id)
+    open(GzipCompressorStream, "data/nsw.csv.gz", "w") do io
+        CSV.write(io, long)
+    end
+end
+
+# Generate data/mpdta.csv.gz from data/mpdta.rda
+# Convert mpdta from the did R package to csv format
+function mpdta()
+    objects = load("data/mpdta.rda")
+    df = objects["mpdta"]
+    # Store first_treat as Int and drop the treat column before writing
+    df[!, :first_treat] = convert(Vector{Int}, df.first_treat)
+    select!(df, Not(:treat))
+    open(GzipCompressorStream, "data/mpdta.csv.gz", "w") do io
+        CSV.write(io, df)
+    end
+end
+
+# Regenerate all example datasets: hrs, nsw, then mpdta
+# Returns whatever the last step (`mpdta()`) returns
+function make()
+    hrs()
+    nsw()
+    return mpdta()
+end
@@ -1,6 +1,7 @@
 module DiffinDiffsBase
 
-using CSV: File
+using CSV
+using CodecZlib: GzipDecompressorStream
 using Combinatorics: combinations
 using DataAPI: refarray, refpool
 using MacroTools: @capture, isexpr, postwalk