Skip to content

Commit 734b2f0

Browse files
authored
programmatic construction for lead/lag (#176)
* implement and test programmatic construction for lead/lag
* add docs, missing terms method, test for schema extraction
* pointer to other programmatic section in docs
* fix test
1 parent 749fdcd commit 734b2f0

File tree

5 files changed

+122
-2
lines changed

5 files changed

+122
-2
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name = "StatsModels"
22
uuid = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
3-
version = "0.6.8"
3+
version = "0.6.9"
44

55
[deps]
66
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"

docs/src/temporal_terms.md

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ and following a regular time interval, which may require inserting additional
1717
rows containing `missing`s to fill in gaps in irregular data.
1818

1919
Below is a simple example:
20-
```jldoctest
20+
```jldoctest leadlag
2121
julia> using StatsModels, DataFrames
2222
2323
julia> df = DataFrame(y=1:5, x=2:2:10)
@@ -57,3 +57,41 @@ julia> modelmatrix(f, df)
5757
8 4 missing
5858
10 6 missing
5959
```
60+
61+
### Programmatic construction of lead and lag terms
62+
63+
StatsModels.jl provides methods for `lead` and `lag` that allow `LeadLagTerm`s
64+
to be constructed programmatically (at run time). See the section on
65+
[Constructing a formula programmatically](@ref) for more information. For a
66+
short example, you can produce the same formula as above without the `@formula`
67+
macro like this:
68+
69+
```jldoctest leadlag
70+
julia> y, x = term(:y), term(:x);
71+
72+
julia> f2 = y ~ x + lag(x, 2) + lead(x, 2)
73+
FormulaTerm
74+
Response:
75+
y(unknown)
76+
Predictors:
77+
x(unknown)
78+
lag(x, 2)
79+
lead(x, 2)
80+
81+
julia> f2 = apply_schema(f2, schema(f2, df))
82+
FormulaTerm
83+
Response:
84+
y(continuous)
85+
Predictors:
86+
x(continuous)
87+
lag(x, 2)
88+
lead(x, 2)
89+
90+
julia> modelmatrix(f2, df)
91+
5×3 reshape(::Array{Union{Missing, Int64},2}, 5, 3) with eltype Union{Missing, Int64}:
92+
2 missing 6
93+
4 missing 8
94+
6 2 10
95+
8 4 missing
96+
10 6 missing
97+
```

src/StatsModels.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ module StatsModels
22

33
using Tables
44
using StatsBase
5+
using ShiftedArrays
56
using ShiftedArrays: lag, lead
67
using DataStructures
78
using DataAPI: levels

src/temporal_terms.jl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ struct LeadLagTerm{T<:AbstractTerm, F<:Union{typeof(lead), typeof(lag)}} <: Abst
2626
nsteps::Int
2727
end
2828

29+
# A lead/lag term wraps a single inner term; return it as a one-element tuple.
terms(t::LeadLagTerm) = tuple(t.term)
30+
2931
function apply_schema(t::FunctionTerm{F}, sch::Schema, ctx::Type) where F<:Union{typeof(lead), typeof(lag)}
3032
opname = string(nameof(F.instance))
3133
if length(t.args_parsed) == 1 # lag(term)
@@ -44,6 +46,14 @@ function apply_schema(t::FunctionTerm{F}, sch::Schema, ctx::Type) where F<:Union
4446
return LeadLagTerm{typeof(term), F}(term, nsteps)
4547
end
4648

49+
function apply_schema(t::LeadLagTerm{T, F}, sch::Schema, ctx::Type) where {T,F}  # schema application for an already-constructed LeadLagTerm
50+
term = apply_schema(t.term, sch, ctx)  # apply the schema to the wrapped term first
51+
LeadLagTerm{typeof(term), F}(term, t.nsteps)  # re-wrap with the same F and nsteps; first type parameter becomes typeof(term)
52+
end
53+
54+
# Programmatic constructor: wrap the term `t` in a `LeadLagTerm` tagged with `lead`
# and `n` steps (default 1).
ShiftedArrays.lead(t::AbstractTerm, n=1) = LeadLagTerm{typeof(t),typeof(lead)}(t, n)
55+
# Programmatic constructor: wrap the term `t` in a `LeadLagTerm` tagged with `lag`
# and `n` steps (default 1).
ShiftedArrays.lag(t::AbstractTerm, n=1) = LeadLagTerm{typeof(t),typeof(lag)}(t, n)
56+
4757
function modelcols(ll::LeadLagTerm{<:Any, F}, d::Tables.ColumnTable) where F
4858
original_cols = modelcols(ll.term, d)
4959
return F.instance(original_cols, ll.nsteps)

test/temporal_terms.jl

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,43 @@ using DataStructures
9191
@test_throws InexactError apply_schema(bad_f, schema(bad_f, df))
9292
end
9393
end # Unhappy Path testset
94+
95+
@testset "Programmatic construction" begin  # lag terms built without @formula should match the ones @formula produces
96+
using StatsModels: LeadLagTerm  # imported by name; presumably not exported — TODO confirm
97+
98+
df = (y=1:10, x=1:10)
99+
100+
@testset "schema" begin  # the schema extracted from lag(x) should equal the schema extracted from x
101+
t = lag(term(:x))
102+
@test schema(t, df).schema == schema(term(:x), df).schema
103+
end
104+
105+
@testset "one-arg" begin  # lag with the step count left at its default
106+
f = @formula(y ~ lag(x))
107+
sch = schema(f, df)
108+
ff = apply_schema(f, sch)
109+
t1 = ff.rhs.terms[1]  # reference: first right-hand-side term of the @formula-built formula
110+
t2 = apply_schema(LeadLagTerm{Term, typeof(lag)}(term(:x), 1), sch)  # built with the explicit type constructor
111+
t3 = apply_schema(lag(term(:x)), sch)  # built by calling lag on a term
112+
113+
@test isequal(modelcols(t1, df), modelcols(t2, df))  # isequal, since shifted columns presumably contain `missing` — TODO confirm
114+
@test isequal(modelcols(t1, df), modelcols(t3, df))
115+
@test coefnames(t1) == coefnames(t2) == coefnames(t3)
116+
end
117+
118+
@testset "two-arg" begin  # lag with an explicit step count of 3
119+
f = @formula(y ~ lag(x, 3))
120+
sch = schema(f, df)
121+
ff = apply_schema(f, sch)
122+
t1 = ff.rhs.terms[1]  # reference: first right-hand-side term of the @formula-built formula
123+
t2 = apply_schema(LeadLagTerm{Term, typeof(lag)}(term(:x), 3), sch)  # built with the explicit type constructor
124+
t3 = apply_schema(lag(term(:x), 3), sch)  # built by calling lag on a term
125+
126+
@test isequal(modelcols(t1, df), modelcols(t2, df))  # isequal, since shifted columns presumably contain `missing` — TODO confirm
127+
@test isequal(modelcols(t1, df), modelcols(t3, df))
128+
@test coefnames(t1) == coefnames(t2) == coefnames(t3)
129+
end
130+
end
94131
end # Lag testset
95132

96133
# The code for lag and lead is basically the same, as we tested lag comprehensively above
@@ -109,5 +146,39 @@ using DataStructures
109146

110147
@test coefnames(f)[2] == ["x_lead0", "x_lead1", "x_lead3", "x_lead11"]
111148
end
149+
150+
@testset "Programmatic construction" begin  # lead terms built without @formula should match the ones @formula produces
151+
using StatsModels: LeadLagTerm  # imported by name; presumably not exported — TODO confirm
152+
153+
df = (y=1:10, x=1:10)
154+
155+
@testset "one-arg" begin  # lead with the step count left at its default
156+
f = @formula(y ~ lead(x))
157+
sch = schema(f, df)
158+
ff = apply_schema(f, sch)
159+
t1 = ff.rhs.terms[1]  # reference: first right-hand-side term of the @formula-built formula
160+
t2 = apply_schema(LeadLagTerm{Term, typeof(lead)}(term(:x), 1), sch)  # built with the explicit type constructor
161+
t3 = apply_schema(lead(term(:x)), sch)  # built by calling lead on a term
162+
163+
@test isequal(modelcols(t1, df), modelcols(t2, df))  # isequal, since shifted columns presumably contain `missing` — TODO confirm
164+
@test isequal(modelcols(t1, df), modelcols(t3, df))
165+
@test coefnames(t1) == coefnames(t2) == coefnames(t3)
166+
end
167+
168+
@testset "two-arg" begin  # lead with an explicit step count of 3
169+
f = @formula(y ~ lead(x, 3))
170+
sch = schema(f, df)
171+
ff = apply_schema(f, sch)
172+
t1 = ff.rhs.terms[1]  # reference: first right-hand-side term of the @formula-built formula
173+
t2 = apply_schema(LeadLagTerm{Term, typeof(lead)}(term(:x), 3), sch)  # built with the explicit type constructor
174+
t3 = apply_schema(lead(term(:x), 3), sch)  # built by calling lead on a term
175+
176+
@test isequal(modelcols(t1, df), modelcols(t2, df))  # isequal, since shifted columns presumably contain `missing` — TODO confirm
177+
@test isequal(modelcols(t1, df), modelcols(t3, df))
178+
@test coefnames(t1) == coefnames(t2) == coefnames(t3)
179+
end
180+
end
181+
182+
112183
end
113184
end

0 commit comments

Comments
 (0)