programmatic construction for lead/lag (#176)

kleinschmidt · web-flow · commit 734b2f08f82c · 2020-03-03T20:50:55.000-05:00
* implement and test programmatic construction for lead/lag

* add docs, missing terms method, test for schema extraction

* pointer to other programmatic section in docs

* fix test
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "StatsModels"
 uuid = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
-version = "0.6.8"
+version = "0.6.9"
 
 [deps]
 DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
diff --git a/docs/src/temporal_terms.md b/docs/src/temporal_terms.md
@@ -17,7 +17,7 @@ and following a regular time interval, which may require inserting additional
 rows containing `missing`s  to fill in gaps in irregular data.
 
 Below is a simple example:
-```jldoctest
+```jldoctest leadlag
 julia> using StatsModels, DataFrames
 
 julia> df = DataFrame(y=1:5, x=2:2:10)
@@ -57,3 +57,41 @@ julia> modelmatrix(f, df)
   8  4           missing
  10  6           missing
 ```
+
+### Programmatic construction of lead and lag terms
+
+StatsModels.jl provides methods for `lead` and `lag` that allow `LeadLagTerm`s
+to be constructed programmatically (at run time).  See the section on
+[Constructing a formula programmatically](@ref) for more information.  For a
+short example, you can produce the same formula as above without the `@formula`
+macro like this:
+
+```jldoctest leadlag
+julia> y, x = term(:y), term(:x);
+
+julia> f2 = y ~ x + lag(x, 2) + lead(x, 2)
+FormulaTerm
+Response:
+  y(unknown)
+Predictors:
+  x(unknown)
+  lag(x, 2)
+  lead(x, 2)
+
+julia> f2 = apply_schema(f2, schema(f2, df))
+FormulaTerm
+Response:
+  y(continuous)
+Predictors:
+  x(continuous)
+  lag(x, 2)
+  lead(x, 2)
+
+julia> modelmatrix(f2, df)
+5×3 reshape(::Array{Union{Missing, Int64},2}, 5, 3) with eltype Union{Missing, Int64}:
+  2   missing   6       
+  4   missing   8       
+  6  2         10       
+  8  4           missing
+ 10  6           missing
+```
diff --git a/src/StatsModels.jl b/src/StatsModels.jl
@@ -2,6 +2,7 @@ module StatsModels
 
 using Tables
 using StatsBase
+using ShiftedArrays
 using ShiftedArrays: lag, lead
 using DataStructures
 using DataAPI: levels
diff --git a/src/temporal_terms.jl b/src/temporal_terms.jl
@@ -26,6 +26,8 @@ struct LeadLagTerm{T<:AbstractTerm, F<:Union{typeof(lead), typeof(lag)}} <: Abst
     nsteps::Int
 end
 
+terms(t::LeadLagTerm) = (t.term,)  # 1-tuple holding only the wrapped term, not nsteps
+
 function apply_schema(t::FunctionTerm{F}, sch::Schema, ctx::Type) where F<:Union{typeof(lead), typeof(lag)}
     opname = string(nameof(F.instance))
     if length(t.args_parsed) == 1  # lag(term)
@@ -44,6 +46,14 @@ function apply_schema(t::FunctionTerm{F}, sch::Schema, ctx::Type) where F<:Union
     return LeadLagTerm{typeof(term), F}(term, nsteps)
 end
 
+function apply_schema(t::LeadLagTerm{T, F}, sch::Schema, ctx::Type) where {T,F}
+    inner = apply_schema(t.term, sch, ctx)  # resolve the wrapped term against the schema
+    return LeadLagTerm{typeof(inner), F}(inner, t.nsteps)  # keep shift function and steps
+end
+
+ShiftedArrays.lead(t::AbstractTerm, n=1) = LeadLagTerm{typeof(t),typeof(lead)}(t, n)
+ShiftedArrays.lag(t::AbstractTerm, n=1) = LeadLagTerm{typeof(t),typeof(lag)}(t, n)
+
 function modelcols(ll::LeadLagTerm{<:Any, F}, d::Tables.ColumnTable) where F
     original_cols = modelcols(ll.term, d)
     return F.instance(original_cols, ll.nsteps)
diff --git a/test/temporal_terms.jl b/test/temporal_terms.jl
@@ -91,6 +91,43 @@ using DataStructures
                 @test_throws InexactError apply_schema(bad_f, schema(bad_f, df))
             end
         end # Unhappy Path testset
+
+        @testset "Programmatic construction" begin  # lag terms built without @formula
+            using StatsModels: LeadLagTerm
+
+            df = (y=1:10, x=1:10)  # NamedTuple of ranges used as the data table
+
+            @testset "schema" begin
+                t = lag(term(:x))  # schema extraction should match that of the bare term
+                @test schema(t, df).schema == schema(term(:x), df).schema
+            end
+
+            @testset "one-arg" begin 
+                f = @formula(y ~ lag(x))
+                sch = schema(f, df)
+                ff = apply_schema(f, sch)
+                t1 = ff.rhs.terms[1]  # reference term produced via @formula
+                t2 = apply_schema(LeadLagTerm{Term, typeof(lag)}(term(:x), 1), sch)
+                t3 = apply_schema(lag(term(:x)), sch)
+                # isequal rather than ==, since lagged columns contain missing
+                @test isequal(modelcols(t1, df), modelcols(t2, df))
+                @test isequal(modelcols(t1, df), modelcols(t3, df))
+                @test coefnames(t1) == coefnames(t2) == coefnames(t3)
+            end
+
+            @testset "two-arg" begin
+                f = @formula(y ~ lag(x, 3))
+                sch = schema(f, df)
+                ff = apply_schema(f, sch)
+                t1 = ff.rhs.terms[1]  # reference term produced via @formula
+                t2 = apply_schema(LeadLagTerm{Term, typeof(lag)}(term(:x), 3), sch)
+                t3 = apply_schema(lag(term(:x), 3), sch)
+                # t2 uses the explicit constructor, t3 the lag(::AbstractTerm, n) method
+                @test isequal(modelcols(t1, df), modelcols(t2, df))
+                @test isequal(modelcols(t1, df), modelcols(t3, df))
+                @test coefnames(t1) == coefnames(t2) == coefnames(t3)
+            end
+        end
     end # Lag testset
 
     # The code for lag and lead is basically the same, as we tested lag comprehensively above
@@ -109,5 +146,39 @@ using DataStructures
 
             @test coefnames(f)[2] == ["x_lead0", "x_lead1", "x_lead3", "x_lead11"]
         end
+
+        @testset "Programmatic construction" begin  # lead terms built without @formula
+            using StatsModels: LeadLagTerm
+
+            df = (y=1:10, x=1:10)  # NamedTuple of ranges used as the data table
+
+            @testset "one-arg" begin 
+                f = @formula(y ~ lead(x))
+                sch = schema(f, df)
+                ff = apply_schema(f, sch)
+                t1 = ff.rhs.terms[1]  # reference term produced via @formula
+                t2 = apply_schema(LeadLagTerm{Term, typeof(lead)}(term(:x), 1), sch)
+                t3 = apply_schema(lead(term(:x)), sch)
+                # isequal rather than ==, since led columns contain missing
+                @test isequal(modelcols(t1, df), modelcols(t2, df))
+                @test isequal(modelcols(t1, df), modelcols(t3, df))
+                @test coefnames(t1) == coefnames(t2) == coefnames(t3)
+            end
+
+            @testset "two-arg" begin
+                f = @formula(y ~ lead(x, 3))
+                sch = schema(f, df)
+                ff = apply_schema(f, sch)
+                t1 = ff.rhs.terms[1]  # reference term produced via @formula
+                t2 = apply_schema(LeadLagTerm{Term, typeof(lead)}(term(:x), 3), sch)
+                t3 = apply_schema(lead(term(:x), 3), sch)
+                # t2 uses the explicit constructor, t3 the lead(::AbstractTerm, n) method
+                @test isequal(modelcols(t1, df), modelcols(t2, df))
+                @test isequal(modelcols(t1, df), modelcols(t3, df))
+                @test coefnames(t1) == coefnames(t2) == coefnames(t3)
+            end
+        end
+
+        
     end
 end