Skip to content
This repository was archived by the owner on Mar 11, 2022. It is now read-only.

Commit bba59a4

Browse files
authored
Add PanelStructure and panel operations lead, lag and diff (#22)
1 parent a0edede commit bba59a4

File tree

4 files changed

+360
-32
lines changed

4 files changed

+360
-32
lines changed

src/DiffinDiffsBase.jl

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@ using Base: @propagate_inbounds
44
using CSV
55
using CodecZlib: GzipDecompressorStream
66
using Combinatorics: combinations
7-
using DataAPI: refarray, refpool
7+
using DataAPI: refarray, refpool, invrefpool
88
using LinearAlgebra: Diagonal
99
using MacroTools: @capture, isexpr, postwalk
10-
using Missings: disallowmissing
10+
using Missings: allowmissing, disallowmissing
1111
using PooledArrays: _label
1212
using Reexport
1313
using StatsBase: CoefTable, Weights, stderror, uweights
@@ -17,11 +17,11 @@ using StatsModels: Schema
1717
using Tables
1818
using Tables: AbstractColumns, table, istable, columnnames, getcolumn
1919

20-
import Base: ==, show, parent, view
20+
import Base: ==, show, parent, view, diff
2121
import Base: eltype, firstindex, lastindex, getindex, iterate, length, sym_in
2222
import StatsBase: coef, vcov, confint, nobs, dof_residual, responsename, coefnames, weights,
2323
coeftable
24-
import StatsModels: concrete_term, schema, termvars
24+
import StatsModels: concrete_term, schema, termvars, lag, lead
2525

2626
const TimeType = Int
2727

@@ -72,6 +72,14 @@ export cb,
7272

7373
findcell,
7474
cellrows,
75+
PanelStructure,
76+
setpanel,
77+
findlag!,
78+
findlead!,
79+
ilag!,
80+
ilead!,
81+
diff!,
82+
diff,
7583

7684
StatsStep,
7785
AbstractStatsProcedure,

src/operations.jl

Lines changed: 267 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,3 @@
1-
# Obtain unique labels for row-wise pairs of values from a1 and a2 when mult1 is large enough
2-
function _mult!(a1::Array, mult1::Integer, a2::AbstractArray)
3-
a1 .+= mult1 .* (a2 .- 1)
4-
end
5-
61
# A variant of SplitApplyCombine.groupfind using IdDict instead of Dictionaries.Dictionary
72
function _groupfind(container)
83
T = keytype(container)
@@ -13,6 +8,21 @@ function _groupfind(container)
138
return inds
149
end
1510

11+
# Return the references and the pool for the values of `col`,
# together with a flag telling whether `col` supplied its own pool via `refpool`.
function _refs_pool(col::AbstractArray, ref_type::Type{<:Integer}=UInt32)
    pool = refpool(col)
    # A column that already carries a pool is used as it is
    pool === nothing || return refarray(col), pool, true
    # Otherwise the values are labeled by `_label`, with `ref_type` passed along
    refs, _, pool = _label(col, eltype(col), ref_type)
    return refs, pool, false
end
20+
21+
# Obtain unique labels for row-wise pairs of values from a1 and a2 when mult is large enough
22+
function _mult!(a1::AbstractArray, a2::AbstractArray, mult)
    # Shift every label in a1 by a multiple of mult selected by the label in a2;
    # a1 is updated in place
    @. a1 += mult * (a2 - 1)
    return a1
end
25+
1626
"""
1727
findcell(cols::VecColumnTable)
1828
findcell(names, data, esample=Colon())
@@ -36,26 +46,15 @@ rather than those for the full `data`.
3646
function findcell(cols::VecColumnTable)
3747
ncol = size(cols, 2)
3848
isempty(cols) && throw(ArgumentError("no data column is found"))
39-
col = cols[1]
40-
refs = refarray(col)
41-
pool = refpool(col)
42-
labeled = pool !== nothing && eltype(refs) <: Unsigned
43-
if !labeled
44-
refs, invpool, pool = _label(col)
45-
end
49+
refs, pool, labeled = _refs_pool(cols[1])
4650
mult = length(pool)
4751
if ncol > 1
4852
# Make a copy to be used as cache
49-
labeled && (refs = collect(refs))
53+
labeled && (refs = copy(refs))
5054
@inbounds for n in 2:ncol
51-
col = cols[n]
52-
refsn = refarray(col)
53-
pool = refpool(col)
54-
if pool === nothing || !(eltype(refsn) <: Unsigned)
55-
refsn, invpool, pool = _label(col)
56-
end
55+
refsn, pool, labeled = _refs_pool(cols[n])
5756
multn = length(pool)
58-
_mult!(refs, mult, refsn)
57+
_mult!(refs, refsn, mult)
5958
mult = mult * multn
6059
end
6160
end
@@ -108,3 +107,251 @@ function cellrows(cols::VecColumnTable, refrows::IdDict)
108107
end
109108
return cells, rows
110109
end
110+
111+
"""
112+
PanelStructure{R<:Signed, T1, T2<:TimeType}
113+
114+
Panel data structure defined by unique combinations of unit ids and time periods.
115+
It contains the information required for certain operations such as
116+
[`lag`](@ref) and [`diff`](@ref).
117+
See also [`setpanel`](@ref).
118+
119+
# Fields
120+
- `refs::Vector{R}`: reference values that allow obtaining time gaps by taking differences.
121+
- `invrefs::Dict{R, Int}`: inverse map from `refs` to indices.
122+
- `idpool::Vector{T1}`: unique unit ids.
123+
- `timepool::Vector{T2}`: sorted unique time periods.
124+
- `laginds::Dict{Int, Vector{Int}}`: a map from lag distances to vectors of indices of lagged values.
125+
"""
126+
struct PanelStructure{R<:Signed, T1, T2<:TimeType}
    refs::Vector{R}
    invrefs::Dict{R, Int}
    idpool::Vector{T1}
    timepool::Vector{T2}
    laginds::Dict{Int, Vector{Int}}
    # The type parameters are taken from the element types of the arguments
    # and the inverse map is derived from refs rather than being supplied
    function PanelStructure(refs::Vector, idpool::Vector, timepool::Vector,
            laginds::Dict=Dict{Int, Vector{Int}}())
        R = eltype(refs)
        invrefs = Dict{R, Int}()
        # For a repeated reference value, the last index is the one that is kept
        for (i, ref) in enumerate(refs)
            invrefs[ref] = i
        end
        return new{R, eltype(idpool), eltype(timepool)}(
            refs, invrefs, idpool, timepool, laginds)
    end
end
139+
140+
# Map the values of a time column to integer references such that
# the difference between two references is the number of steps between the two periods.
# The smallest value in the pool is mapped to 1.
# If `step` is `nothing`, the smallest gap between two sorted pool values is used.
# Return the scaled references (with element type `ref_type`) and the sorted pool.
function _scaledrefs_pool(col::AbstractArray, step, ref_type::Type{<:Signed}=Int32)
    refs, pool, labeled = _refs_pool(col, ref_type)
    # References borrowed from a labeled column must not be overwritten
    # and their element type need not be ref_type,
    # hence the scaled values are written to a separate array in that case;
    # references freshly created by _refs_pool are rescaled in place
    # (presumably _label already returns them with element type ref_type)
    scaled = labeled ? similar(refs, ref_type) : refs
    spool = sort(pool)
    npool = length(pool)
    # There is nothing to scale without any time period
    npool == 0 && return scaled, spool
    pool1 = spool[1]
    if step === nothing
        # A single period leaves no gap to infer the step from; fall back to a unit step
        step = npool > 1 ? minimum(spool[i+1] - spool[i] for i in 1:npool-1) : one(pool1)
    end
    # refmap[k] is the scaled reference for the k-th value in the (unsorted) pool
    refmap = Vector{eltype(scaled)}(undef, npool)
    @inbounds for i in 1:npool
        refmap[i] = (pool[i] - pool1) ÷ step + 1
    end
    @inbounds for i in eachindex(refs)
        scaled[i] = refmap[refs[i]]
    end
    return scaled, spool
end
159+
160+
"""
161+
setpanel(data, idname, timename, timestep=nothing; ref_type=Int32)
162+
setpanel(id::AbstractArray, time::AbstractArray, timestep=nothing; ref_type=Int32)
163+
164+
Declare a [`PanelStructure`](@ref) which is required for certain operations
165+
such as [`lag`](@ref) and [`diff`](@ref).
166+
Either a `data` table with `idname` and `timename` for columns representing
167+
unit ids and time periods
168+
or two arrays `id` and `time` representing the two columns are required.
169+
In the former case, `data` must be Tables.jl-compatible.
170+
171+
By default, the time interval `timestep` between two adjacent periods is inferred
172+
based on the minimum gap between two values in the `time` column.
173+
The element type of reference values for [`PanelStructure`](@ref)
174+
can be specified with `ref_type`.
175+
176+
!!! note
177+
If the underlying data used to create the [`PanelStructure`](@ref) are modified,
178+
the changes will not be reflected in the existing instances of [`PanelStructure`](@ref).
179+
A new instance needs to be created with `setpanel`.
180+
"""
181+
function setpanel(id::AbstractArray, time::AbstractArray, timestep=nothing;
        ref_type::Type{<:Signed}=Int32)
    eltype(time) <: TimeType ||
        throw(ArgumentError("invalid element type $(eltype(time)) from time column"))
    length(id) == length(time) || throw(DimensionMismatch(
        "id has length $(length(id)) while time has length $(length(time))"))
    # The id references are only read by _mult! below, so they are never copied
    idrefs, idpool, _ = _refs_pool(id)
    trefs, tpool = _scaledrefs_pool(time, timestep, ref_type)
    # The scaled time references count steps from the earliest period,
    # so they exceed the number of distinct periods whenever some periods are absent.
    # The range reserved for each unit must cover the largest reference actually used
    # and is never smaller than the number of distinct periods
    nspan = mapreduce(Int, max, trefs; init=length(tpool))
    # Multiply 2 to create enough gaps between id groups for the largest possible l
    _mult!(trefs, idrefs, 2 * nspan)
    return PanelStructure(trefs, idpool, tpool)
end
195+
196+
function setpanel(data, idname::Union{Symbol,Integer}, timename::Union{Symbol,Integer},
        timestep=nothing; ref_type::Type{<:Signed}=Int32)
    if !istable(data)
        throw(ArgumentError("input data is not Tables.jl-compatible"))
    end
    # Pull out the two columns and defer to the method for arrays
    idcol = getcolumn(data, idname)
    timecol = getcolumn(data, timename)
    return setpanel(idcol, timecol, timestep; ref_type=ref_type)
end
202+
203+
function show(io::IO, panel::PanelStructure)
    # Compact form: a fixed label without any content of the panel
    print(io, "Panel Structure")
    return nothing
end
204+
205+
function show(io::IO, ::MIME"text/plain", panel::PanelStructure)
    # All fields are printed through the same context
    # with :limit set and a display size of (1, 80)
    ctx = IOContext(io, :limit=>true, :displaysize=>(1, 80))
    println(io, "Panel Structure:")
    println(ctx, " idpool: ", panel.idpool)
    println(ctx, " timepool: ", panel.timepool)
    print(ctx, " laginds: ", panel.laginds)
    return nothing
end
211+
212+
"""
213+
findlag!(panel::PanelStructure, l::Integer=1)
214+
215+
Construct a vector of indices of the `l`th lagged values
216+
for all id-time combinations of `panel`
217+
and save the result in `panel.laginds`.
218+
If a lagged value does not exist, its index is filled with 0.
219+
See also [`ilag!`](@ref).
220+
"""
221+
function findlag!(panel::PanelStructure, l::Integer=1)
    if !(abs(l) < length(panel.timepool))
        throw(ArgumentError("|l| must be smaller than $(length(panel.timepool)); got $l"))
    end
    refs = panel.refs
    lookup = panel.invrefs
    # Shift in the same integer type as the references
    shift = convert(eltype(refs), l)
    inds = Vector{Int}(undef, size(refs))
    # A shifted reference that is not in the inverse map yields 0
    map!(ref -> get(lookup, ref - shift, 0), inds, refs)
    # Keep the result so that it can be reused later
    panel.laginds[shift] = inds
    return inds
end
237+
238+
"""
239+
findlead!(panel::PanelStructure, l::Integer=1)
240+
241+
Construct a vector of indices of the `l`th lead values
242+
for all id-time combinations of `panel`
243+
and save the result in `panel.laginds`.
244+
If a lead value does not exist, its index is filled with 0.
245+
See also [`ilead!`](@ref).
246+
"""
247+
function findlead!(panel::PanelStructure, l::Integer=1)
    # A lead by l is handled as a lag by -l
    return findlag!(panel, -l)
end
248+
249+
"""
250+
ilag!(panel::PanelStructure, l::Integer=1)
251+
252+
Return a vector of indices of the `l`th lagged values
253+
for all id-time combinations of `panel`.
254+
The indices are retrieved from `panel.laginds` if they have been collected before.
255+
Otherwise, they are created by calling [`findlag!`](@ref).
256+
See also [`ilead!`](@ref).
257+
"""
258+
function ilag!(panel::PanelStructure, l::Integer=1)
    # Reuse the indices saved in panel.laginds when they exist
    haskey(panel.laginds, l) && return panel.laginds[l]
    # findlag! saves the indices it constructs before returning them
    return findlag!(panel, l)
end
262+
263+
"""
264+
ilead!(panel::PanelStructure, l::Integer=1)
265+
266+
Return a vector of indices of the `l`th lead values
267+
for all id-time combinations of `panel`.
268+
The indices are retrieved from `panel.laginds` if they have been collected before.
269+
Otherwise, they are created by calling [`findlead!`](@ref).
270+
See also [`ilag!`](@ref).
271+
"""
272+
function ilead!(panel::PanelStructure, l::Integer=1)
    # A lead by l is handled as a lag by -l
    return ilag!(panel, -l)
end
273+
274+
"""
275+
lag(panel::PanelStructure, v::AbstractArray, l::Integer=1; default=missing)
276+
277+
Return a vector of `l`th lagged values of `v` with missing values filled with `default`.
278+
The `panel` structure is respected.
279+
See also [`ilag!`](@ref) and [`lead`](@ref).
280+
"""
281+
function lag(panel::PanelStructure, v::AbstractArray, l::Integer=1; default=missing)
    if length(v) != length(panel.refs)
        throw(DimensionMismatch(
            "v has length $(length(v)) while expecting $(length(panel.refs))"))
    end
    inds = ilag!(panel, l)
    # The element type is widened only when missing is the fill value
    if default === missing
        out = similar(v, Union{eltype(v), Missing})
    else
        out = similar(v)
    end
    @inbounds for i in 1:length(v)
        j = inds[i]
        # An index of 0 marks a lagged value that does not exist
        out[i] = j == 0 ? default : v[j]
    end
    return out
end
291+
292+
"""
293+
lead(panel::PanelStructure, v::AbstractArray, l::Integer=1; default=missing)
294+
295+
Return a vector of `l`th lead values of `v` with missing values filled with `default`.
296+
The `panel` structure is respected.
297+
See also [`ilead!`](@ref) and [`lag`](@ref).
298+
"""
299+
function lead(panel::PanelStructure, v::AbstractArray, l::Integer=1; default=missing)
    # A lead by l is handled as a lag by -l
    return lag(panel, v, -l; default=default)
end
301+
302+
# Fill dest[i] with v[i] - v[inds[i]], or with default where inds[i] is 0.
# Bounds are not checked; the caller is responsible for matching lengths.
function _diff!(dest::AbstractArray, v::AbstractArray, inds::AbstractArray, default)
    @inbounds for i in 1:length(v)
        j = inds[i]
        if j == 0
            dest[i] = default
        else
            dest[i] = v[i] - v[j]
        end
    end
    return nothing
end
307+
308+
"""
309+
diff!(dest::AbstractArray, panel::PanelStructure, v::AbstractArray; kwargs...)
310+
311+
Take the differences of `v` within observations for each unit in `panel`
312+
and store the result in `dest`.
313+
By default, it calculates the first differences.
314+
See also [`diff`](@ref).
315+
316+
# Keywords
317+
- `order::Integer=1`: the order of differences to be taken.
318+
- `l::Integer=1`: the time interval between each pair of observations.
319+
- `default=missing`: default values for indices where the differences do not exist.
320+
"""
321+
function diff!(dest::AbstractArray, panel::PanelStructure, v::AbstractArray;
322+
order::Integer=1, l::Integer=1, default=missing)
323+
length(dest) == length(v) || throw(DimensionMismatch(
324+
"dest has length $(length(dest)) while v has length $(length(v))"))
325+
0 < order < length(panel.timepool) || throw(ArgumentError(
326+
"order must be between 0 and $(length(panel.timepool)); got $order"))
327+
inds = get(panel.laginds, l, nothing)
328+
inds === nothing && (inds = findlag!(panel, l))
329+
_diff!(dest, v, inds, default)
330+
if order > 1
331+
cache = similar(dest)
332+
for i in 2:order
333+
copy!(cache, dest)
334+
_diff!(dest, cache, inds, default)
335+
end
336+
end
337+
return dest
338+
end
339+
340+
"""
341+
diff(panel::PanelStructure, v::AbstractArray; kwargs...)
342+
343+
Return the differences of `v` within observations for each unit in `panel`.
344+
By default, it calculates the first differences.
345+
See also [`diff!`](@ref).
346+
347+
# Keywords
348+
- `order::Integer=1`: the order of differences to be taken.
349+
- `l::Integer=1`: the time interval between each pair of observations.
350+
- `default=missing`: default values for indices where the differences do not exist.
351+
"""
352+
function diff(panel::PanelStructure, v::AbstractArray;
        order::Integer=1, l::Integer=1, default=missing)
    # The element type is widened only when missing is the fill value
    if default === missing
        out = similar(v, Union{eltype(v), Missing})
    else
        out = similar(v)
    end
    # diff! fills and returns the array passed as its first argument
    return diff!(out, panel, v; order=order, l=l, default=default)
end

0 commit comments

Comments
 (0)