
Commit d239959

dmbates and palday authored
Switch to Feather/Artifacts for test data (#238)
* Switch to Feather/Artifacts for test data
* dat.rda is no longer needed for tests
* Set compat range for `Feather` package
* fix sha256 entry
* consistency in helper functions across tests
* fix remnant of merge

Co-authored-by: Phillip Alday <[email protected]>
1 parent c983d58 commit d239959

12 files changed: +141 additions, −126 deletions

Artifacts.toml

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+[TestData]
+git-tree-sha1 = "9d575764bc1c1a7860c34c5b153251e5f2ee6704"
+lazy = true
+
+[[TestData.download]]
+sha256 = "0b63ae3e9e457ee4b33482d3bf8cc7f20c8ed7c8b2c863af311ba0944c6d46e4"
+url = "https://ndownloader.figshare.com/files/21085968"

Project.toml

Lines changed: 3 additions & 2 deletions
@@ -11,6 +11,7 @@ GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd"
 NamedArrays = "86f7a689-2022-50b4-a561-43c23ac3c673"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -43,8 +44,8 @@ julia = "1.3"

 [extras]
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
-RData = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
+Feather = "becb17da-46f6-5d3c-ad1b-1c5fe96bc73c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

 [targets]
-test = ["Test", "DataFrames", "RData"]
+test = ["Test", "DataFrames", "Feather"]

src/MixedModels.jl

Lines changed: 6 additions & 0 deletions
@@ -8,6 +8,7 @@ using LinearAlgebra
 using NamedArrays
 using NLopt
 using Random
+using Pkg.Artifacts
 using ProgressMeter
 using Showoff
 using SparseArrays
@@ -52,6 +53,7 @@ export @formula,
     RandomEffectsTerm,
     ReMat,
     SqrtLink,
+    TestData,
     UniformBlockDiagonal,
     VarCorr,
     aic,
@@ -107,6 +109,10 @@ import Base: ==, *

 abstract type MixedModel{T} <: StatsModels.RegressionModel end # model with fixed and random effects

+function __init__()
+    global TestData = artifact"TestData"
+end
+
 include("utilities.jl")
 include("arraytypes.jl")
 include("varcorr.jl")

test/FactorReTerm.jl

Lines changed: 24 additions & 18 deletions
@@ -1,19 +1,24 @@
-using DataFrames, LinearAlgebra, MixedModels, Random, RData, SparseArrays, StatsModels, Test
-
-if !@isdefined(dat) || !isa(dat, Dict{Symbol, DataFrame})
-    const dat = Dict(Symbol(k) => v for (k, v) in
-        load(joinpath(dirname(pathof(MixedModels)), "..", "test", "dat.rda")))
-end
+using DataFrames
+using Feather
+using LinearAlgebra
+using MixedModels
+using Random
+using SparseArrays
+using StatsModels
+using Test

 const LMM = LinearMixedModel

+data(nm::AbstractString) = Feather.read(joinpath(MixedModels.TestData, string(nm, ".feather")))
+data(nm::Symbol) = data(string(nm))
+
 @testset "scalarReMat" begin
-    ds = dat[:Dyestuff]
-    f1 = @formula(Y ~ 1 + (1|G))
+    ds = data("dyestuff")
+    f1 = @formula(yield ~ 1 + (1|batch))
     y1, Xs1 = modelcols(apply_schema(f1, schema(ds), LMM), ds)
     sf = Xs1[2]
-    psts = dat[:Pastes]
-    f2 = @formula(Y ~ 1 + (1|G) + (1|H))
+    psts = data("pastes")
+    f2 = @formula(strength ~ 1 + (1|sample) + (1|batch))
     y2, Xs2 = modelcols(apply_schema(f2, schema(psts), LMM), psts)
     sf1 = Xs2[2]
     sf2 = Xs2[3]
@@ -78,36 +83,36 @@ const LMM = LinearMixedModel
 end

 @testset "RandomEffectsTerm" begin
-    slp = dat[:sleepstudy]
+    slp = data("sleepstudy")
     contrasts = Dict{Symbol,Any}()

     @testset "Detect same variable as blocking and experimental" begin
-        f = @formula(Y ~ 1 + (1 + G|G))
+        f = @formula(reaction ~ 1 + (1 + subj|subj))
         @test_throws ArgumentError apply_schema(f, schema(f, slp, contrasts), LinearMixedModel)
     end

     @testset "Detect both blocking and experimental variables" begin
        # note that U is not in the fixed effects because we want to make square
        # that we're detecting all the variables in the random effects
-        f = @formula(Y ~ 1 + (1 + U|G))
+        f = @formula(reaction ~ 1 + (1 + days|subj))
         form = apply_schema(f, schema(f, slp, contrasts), LinearMixedModel)
-        @test StatsModels.termvars(form.rhs) == [:U, :G]
+        @test StatsModels.termvars(form.rhs) == [:days, :subj]
     end
 end

 @testset "Categorical Blocking Variable" begin
     # deepcopy because we're going to modify it
-    slp = deepcopy(dat[:sleepstudy])
+    slp = deepcopy(data("sleepstudy"))
     contrasts = Dict{Symbol,Any}()
-    f = @formula(Y ~ 1 + (1|G))
+    f = @formula(reaction ~ 1 + (1|subj))

     # String blocking-variables work fine because StatsModels is smart enough to
     # treat strings as Categorical. Note however that this is a
     # far less efficient to store the original dataframe, although it doesn't
     # matter for the contrast matrix
-    slp[!,:G] = convert.(String, slp[!, :G])
+    slp[!,:subj] = convert.(String, slp[!, :subj])
     # @test_throws ArgumentError LinearMixedModel(f, slp)
-    slp[!,:G] = parse.(Int, slp[!, :G])
+    slp.subj = parse.(Int, getindex.(slp.subj, Ref(2:4)))
     @test_throws ArgumentError LinearMixedModel(f, slp)
 end

@@ -169,5 +174,6 @@ end
     @test modelcols(last(ff.rhs), dat) == float(Matrix(I, 18, 18))

     @test_broken fit(MixedModel, @formula(Y ~ 1 + (1|H/c)), dat[:Pastes])
+
    end
 end

test/Project.toml

Lines changed: 2 additions & 2 deletions
@@ -1,9 +1,9 @@
 [deps]
 BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Feather = "becb17da-46f6-5d3c-ad1b-1c5fe96bc73c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NamedArrays = "86f7a689-2022-50b4-a561-43c23ac3c673"
-RData = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -12,4 +12,4 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

 [compat]
-RData = "0.5, 0.6, 0.7"
+Feather = "0.5"

test/UniformBlockDiagonal.jl

Lines changed: 13 additions & 8 deletions
@@ -1,9 +1,14 @@
-using DataFrames, LinearAlgebra, MixedModels, Random, RData, SparseArrays, StatsModels, Test
+using DataFrames
+using Feather
+using LinearAlgebra
+using MixedModels
+using Random
+using SparseArrays
+using StatsModels
+using Test

-if !@isdefined(dat) || !isa(dat, Dict{Symbol, DataFrame})
-    const dat = Dict(Symbol(k) => v for (k, v) in
-        load(joinpath(dirname(pathof(MixedModels)), "..", "test", "dat.rda")))
-end
+dat(nm::AbstractString) = Feather.read(joinpath(MixedModels.TestData, nm * ".feather"))
+dat(nm::Symbol) = dat(string(nm))

 const LMM = LinearMixedModel

@@ -54,10 +59,10 @@ const LMM = LinearMixedModel

 @testset "updateL" begin
     @test ones(2, 2) == MixedModels.rankUpdate!(Hermitian(zeros(2, 2)), ones(2))
-    d3 = dat[:d3]
+    d3 = dat(:d3)
     sch = schema(d3)
-    vf1 = modelcols(apply_schema(@formula(Y ~ 1 + U + (1+U|G)), sch, LMM), d3)[2][2]
-    vf2 = modelcols(apply_schema(@formula(Y ~ 1 + U + (1+U|H)), sch, LMM), d3)[2][2]
+    vf1 = modelcols(apply_schema(@formula(y ~ 1 + u + (1+u|g)), sch, LMM), d3)[2][2]
+    vf2 = modelcols(apply_schema(@formula(y ~ 1 + u + (1+u|h)), sch, LMM), d3)[2][2]
     @test vf1.λ == LowerTriangular(Matrix(I, 2, 2))
     setθ!(vf2, [1.75, 0.0, 1.0])
     A11 = vf1'vf1

test/dat.rda

-9.67 MB
Binary file not shown.

test/fit.jl

Lines changed: 13 additions & 14 deletions
@@ -1,32 +1,31 @@
-using MixedModels, RData, Test
+using MixedModels, Feather, Test

-if !@isdefined(dat) || !isa(dat, Dict{Symbol, DataFrame})
-    const dat = Dict(Symbol(k) => v for (k, v) in
-        load(joinpath(dirname(pathof(MixedModels)), "..", "test", "dat.rda")))
-end
+data(nm::AbstractString) = Feather.read(joinpath(MixedModels.TestData, string(nm, ".feather")))
+
+data(nm::Symbol) = data(string(nm))

 @testset "linear" begin
-    m1 = fit(MixedModel, @formula(Y ~ 1 + (1|G)), dat[:Dyestuff])
+    m1 = fit(MixedModel, @formula(yield ~ 1 + (1|batch)), data(:dyestuff))
     @test first(m1.θ) ≈ 0.7525806757718846 rtol=1.0e-5
 end

 @testset "generalized" begin
-    gm1 = fit(MixedModel, @formula(use ~ 1 + urb + l + a + abs2(a) + (1|d)),
-              dat[:Contraception], Bernoulli())
+    gm1 = fit(MixedModel, @formula(use ~ 1 + urban + livch + age + abs2(age) + (1|dist)),
+              data(:contra), Bernoulli())
     @test deviance(gm1) ≈ 2372.7286 atol=1.0e-3
 end

 @testset "Normal-IdentityLink" begin
-    @test isa(fit(MixedModel, @formula(Y ~ 1 + (1|G)), dat[:Dyestuff], Normal()),
+    @test isa(fit(MixedModel, @formula(yield ~ 1 + (1|batch)), data(:dyestuff), Normal()),
              LinearMixedModel)
    @test_throws(ArgumentError("use LinearMixedModel for Normal distribution with IdentityLink"),
                 fit(GeneralizedLinearMixedModel,
-                    @formula(Y ~ 1 + (1|G)),
-                    dat[:Dyestuff]))
+                    @formula(yield ~ 1 + (1|batch)),
+                    data(:dyestuff)))
 end

 @testset "Normal Distribution GLMM" begin
-    @test isa(fit(MixedModel, @formula(Y ~ 1 + (1|G)), dat[:Dyestuff],
-                  Normal(), SqrtLink()),
-              GeneralizedLinearMixedModel)
+    @test_broken(isa(fit(MixedModel, @formula(yield ~ 1 + (1|batch)), data(:dyestuff),
+                         Normal(), LogLink),
+                     GeneralizedLinearMixedModel))
 end

test/missing.jl

Lines changed: 10 additions & 13 deletions
@@ -1,14 +1,11 @@
-using MixedModels, RData, Test
-
-if !@isdefined(dat) || !isa(dat, Dict{Symbol, DataFrame})
-    const dat = Dict(Symbol(k) => v for (k, v) in
-        load(joinpath(dirname(pathof(MixedModels)), "..", "test", "dat.rda")))
-end
+using Feather, MixedModels, Test

+dat(nm::AbstractString) = Feather.read(joinpath(MixedModels.TestData, string(nm, ".feather")))
+dat(nm::Symbol) = dat(string(nm))
 # deepcopy because we're going to modify it
-slp = deepcopy(dat[:sleepstudy])
-slp[!,:U] = Array{Union{Missing, Float64},1}(slp[!,:U])
-slp[1,:U] = missing
+slp = deepcopy(dat(:sleepstudy))
+slp[!,:days] = Array{Union{Missing, Float64},1}(slp[!,:days])
+slp[1,:days] = missing

 # TODO: re-enable this test when better missing support has landed in StatsModels
 # @testset "No impact from missing on schema" begin
@@ -24,14 +21,14 @@ slp[1,:U] = missing
 @testset "Missing Omit" begin
     @testset "Missing from unused variables" begin
         # missing from unused variables should have no impact
-        m1 = fit(MixedModel, @formula(Y ~ 1 + (1|G)), dat[:sleepstudy])
-        m1_missing = fit(MixedModel, @formula(Y ~ 1 + (1|G)), slp)
+        m1 = fit(MixedModel, @formula(reaction ~ 1 + (1|subj)), dat(:sleepstudy))
+        m1_missing = fit(MixedModel, @formula(reaction ~ 1 + (1|subj)), slp)
         @test isapprox(m1.θ, m1_missing.θ, rtol=1.0e-12)
     end

     @testset "Missing from used variables" begin
-        m1 = fit(MixedModel, @formula(Y ~ 1 + U + (1|G)), dat[:sleepstudy])
-        m1_missing = fit(MixedModel, @formula(Y ~ 1 + U + (1|G)), slp)
+        m1 = fit(MixedModel, @formula(reaction ~ 1 + days + (1|subj)), dat(:sleepstudy))
+        m1_missing = fit(MixedModel, @formula(reaction ~ 1 + days + (1|subj)), slp)
         @test nobs(m1) - nobs(m1_missing) == 1
     end
 end

test/pirls.jl

Lines changed: 19 additions & 19 deletions
@@ -1,13 +1,11 @@
-using DataFrames, LinearAlgebra, MixedModels, RData, Test
-if !@isdefined(dat) || !isa(dat, Dict{Symbol, DataFrame})
-    const dat = Dict(Symbol(k) => v for (k, v) in
-        load(joinpath(dirname(pathof(MixedModels)), "..", "test", "dat.rda")))
-end
+using DataFrames, Feather, LinearAlgebra, MixedModels, Test
+
+dat(nm::AbstractString) = Feather.read(joinpath(MixedModels.TestData, string(nm, ".feather")))
+dat(nm::Symbol) = dat(string(nm))

 @testset "contra" begin
-    contra = dat[:Contraception]
-    contra[!, :urbdist] = categorical(string.(contra[!, :d], contra[!, :urb]))
-    contraform = @formula(use ~ 1+a+abs2(a)+urb+l+(1|urbdist))
+    contra = dat(:contra)
+    contraform = @formula(use ~ 1+age+abs2(age)+urban+livch+(1|urbdist))
     gm0 = fit(MixedModel, contraform, contra, Bernoulli(), fast=true);
     @test gm0.lowerbd == zeros(1)
     @test isapprox(gm0.θ, [0.5720734451352923], atol=0.001)
@@ -28,7 +26,7 @@ end
     @test isnan(gm1.σ)
     @test length(gm1.y) == size(gm1.X, 1)
     @test :θ in propertynames(gm0)
-    gm0.βθ = vcat(gm0.β, gm0.theta)
+    # gm0.βθ = vcat(gm0.β, gm0.theta)
     # the next three values are not well defined in the optimization
     #@test isapprox(logdet(gm1), 75.7217, atol=0.1)
     #@test isapprox(sum(abs2, gm1.u[1]), 48.4747, atol=0.1)
@@ -37,21 +35,20 @@ end
 end

 @testset "cbpp" begin
-    cbpp = dat[:cbpp]
-    cbpp[!, :prop] = cbpp[!, :i] ./ cbpp[!, :s]
-    gm2 = fit(MixedModel, @formula(prop ~ 1 + p + (1|h)), cbpp, Binomial(), wts=cbpp[!,:s])
-    @test isapprox(deviance(gm2,true), 100.09585619892968, atol=0.0001)
-    @test isapprox(sum(abs2, gm2.u[1]), 9.723054788538546, atol=0.0001)
-    @test isapprox(logdet(gm2), 16.90105378801136, atol=0.0001)
+    cbpp = dat(:cbpp)
+    gm2 = fit(MixedModel, @formula((incid/hsz) ~ 1 + period + (1|herd)), cbpp, Binomial(), wts=float(cbpp.hsz))
+    @test deviance(gm2,true) ≈ 100.09585619892968 atol=0.0001
+    @test sum(abs2, gm2.u[1]) ≈ 9.723054788538546 atol=0.0001
+    @test logdet(gm2) ≈ 16.90105378801136 atol=0.0001
     @test isapprox(sum(gm2.resp.devresid), 73.47174762237978, atol=0.001)
     @test isapprox(loglikelihood(gm2), -92.02628186840045, atol=0.001)
     @test isnan(sdest(gm2))
     @test varest(gm2) == 1
 end

 @testset "verbagg" begin
-    gm3 = fit(MixedModel, @formula(r2 ~ 1 + a + g + b + s + (1|id)+(1|item)), dat[:VerbAgg],
-              Bernoulli())
+    gm3 = fit(MixedModel, @formula(r2 ~ 1+anger+gender+btype+situ+(1|subj)+(1|item)),
+              dat(:verbagg), Bernoulli())
     @test deviance(gm3) ≈ 8151.40 rtol=1e-5
     @test lowerbd(gm3) == vcat(fill(-Inf, 6), zeros(2))
     @test fitted(gm3) == predict(gm3)
@@ -61,8 +58,11 @@ end
 end

 @testset "grouseticks" begin
-    gm4 = fit(MixedModel, @formula(t ~ 1 + y + ch + (1|i) + (1|b) + (1|l)),
-              dat[:grouseticks], Poisson(), fast=true) # fails in pirls! with fast=false
+    center(v::AbstractVector) = v .- (sum(v) / length(v))
+    grouseticks = dat(:grouseticks)
+    grouseticks.ch = center(grouseticks.height)
+    gm4 = fit(MixedModel, @formula(ticks ~ 1+year+ch+ (1|index) + (1|brood) + (1|location)),
+              grouseticks, Poisson(), fast=true) # fails in pirls! with fast=false
     @test isapprox(deviance(gm4), 851.4046, atol=0.001)
     # these two values are not well defined at the optimum
     #@test isapprox(sum(x -> sum(abs2, x), gm4.u), 196.8695297987013, atol=0.1)
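Side note on the grouseticks change above: the height covariate is now centered on the fly by a local helper instead of coming pre-centered from the old dat.rda. A tiny illustration of what that helper computes (input values made up for the example):

    # Subtract the arithmetic mean so the vector is centered at zero.
    center(v::AbstractVector) = v .- (sum(v) / length(v))

    center([3.0, 4.0, 8.0])   # -> [-2.0, -1.0, 3.0]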
