Skip to content

Commit 434acd4

Browse files
committed
Revert "Revert "Allow missings in the modelmatrix, necessary to use lead/lag in @formula (#109)""
This reverts commit f68e456. merged
1 parent 4887c70 commit 434acd4

File tree

4 files changed

+223
-34
lines changed

4 files changed

+223
-34
lines changed

src/fit.jl

Lines changed: 90 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -139,27 +139,15 @@ function reg(
139139
if nobs == N
140140
esample = Colon()
141141
end
142-
143-
# Compute weights
144-
if has_weights
145-
weights = Weights(convert(Vector{Float64}, view(df, esample, weights)))
146-
all(isfinite, weights) || throw("Weights are not finite")
147-
else
148-
weights = uweights(nobs)
149-
end
150-
151-
# Compute feM, an AbstractFixedEffectSolver
142+
152143
has_intercept = hasintercept(formula)
153144
has_fe_intercept = false
154145
if has_fes
155146
if any(fe.interaction isa UnitWeights for fe in fes)
156147
has_fe_intercept = true
157148
end
158-
fes = FixedEffect[fe[esample] for fe in fes]
159-
feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, weights, Val{method}, nthreads)
160149
end
161-
# Compute data for std errors
162-
vcov_method = Vcov.materialize(view(df, esample, :), vcov)
150+
163151
##############################################################################
164152
##
165153
## Dataframe --> Matrix
@@ -171,14 +159,23 @@ function reg(
171159
formula_schema = apply_schema(formula, s, FixedEffectModel, has_fe_intercept)
172160

173161
# Obtain y
174-
# for a Vector{Float64}, convert(Vector{Float64}, y) aliases y
175-
y = convert(Vector{Float64}, response(formula_schema, subdf))
176-
all(isfinite, y) || throw("Some observations for the dependent variable are infinite")
162+
# for a Vector{Float64}, convert(Vector{Float64}, y) aliases y
163+
y = response(formula_schema, subdf)
177164

165+
# added in PR #109 to handle cases where formula terms introduce missings
166+
# to be removed when fixed in StatsModels
167+
esample2 = .!ismissing.(y)
168+
178169
# Obtain X
179-
Xexo = convert(Matrix{Float64}, modelmatrix(formula_schema, subdf))
180-
all(isfinite, Xexo) || throw("Some observations for the exogeneous variables are infinite")
181-
170+
Xexo = modelmatrix(formula_schema, subdf)
171+
172+
# PR #109, to be removed when fixed in StatsModels
173+
if size(Xexo, 2) > 0
174+
for c in eachcol(Xexo)
175+
esample2 .&= .!ismissing.(c)
176+
end
177+
end
178+
182179
response_name, coef_names = coefnames(formula_schema)
183180
if !(coef_names isa Vector)
184181
coef_names = typeof(coef_names)[coef_names]
@@ -187,19 +184,89 @@ function reg(
187184
if has_iv
188185
subdf = Tables.columntable((; (x => disallowmissing(view(df[!, x], esample)) for x in endo_vars)...))
189186
formula_endo_schema = apply_schema(formula_endo, schema(formula_endo, subdf, contrasts), StatisticalModel)
190-
Xendo = convert(Matrix{Float64}, modelmatrix(formula_endo_schema, subdf))
191-
all(isfinite, Xendo) || throw("Some observations for the endogenous variables are infinite")
187+
Xendo = modelmatrix(formula_endo_schema, subdf)
188+
189+
# PR #109, to be removed when fixed in StatsModels
190+
if size(Xendo, 2) > 0
191+
for c in eachcol(Xendo)
192+
esample2 .&= .!ismissing.(c)
193+
end
194+
end
195+
192196
_, coefendo_names = coefnames(formula_endo_schema)
193197
append!(coef_names, coefendo_names)
194198

195199
subdf = Tables.columntable((; (x => disallowmissing(view(df[!, x], esample)) for x in iv_vars)...))
196200
formula_iv_schema = apply_schema(formula_iv, schema(formula_iv, subdf, contrasts), StatisticalModel)
197-
Z = convert(Matrix{Float64}, modelmatrix(formula_iv_schema, subdf))
201+
Z = modelmatrix(formula_iv_schema, subdf)
202+
203+
for c in eachcol(Z)
204+
esample2 .&= .!ismissing.(c)
205+
end
206+
207+
# PR #109, to be removed when fixed in StatsModels
208+
if any(esample2 .== false)
209+
Xendo = Xendo[esample2,:]
210+
Z = Z[esample2,:]
211+
end
212+
213+
Xendo = convert(Matrix{Float64}, Xendo)
214+
all(isfinite, Xendo) || throw("Some observations for the endogenous variables are infinite")
215+
216+
Z = convert(Matrix{Float64}, Z)
198217
all(isfinite, Z) || throw("Some observations for the instrumental variables are infinite")
218+
219+
if size(Z, 2) < size(Xendo, 2)
220+
throw("Model not identified. There must be at least as many ivs as endogeneneous variables")
221+
end
199222

200223
# modify formula to use in predict
201224
formula_schema = FormulaTerm(formula_schema.lhs, (tuple(eachterm(formula_schema.rhs)..., (term for term in eachterm(formula_endo_schema.rhs) if term != ConstantTerm(0))...)))
202225
end
226+
227+
esample0 = esample == Colon() ? trues(size(df,1)) : copy(esample)
228+
229+
# PR #109, to be removed when fixed in StatsModels
230+
if any(esample2 .== false)
231+
if any(esample0 .== 0)
232+
throw(ArgumentError("You passed a dataset missing observations and used formula terms that introduce missings. This is not yet supported."))
233+
end
234+
Xexo = Xexo[esample2,:]
235+
y = y[esample2]
236+
esample = copy(esample0)
237+
esample[esample] .= esample2
238+
nobs = sum(esample2)
239+
end
240+
241+
y = convert(Vector{Float64}, y)
242+
all(isfinite, y) || throw("Some observations for the dependent variable are infinite")
243+
244+
Xexo = convert(Matrix{Float64}, Xexo)
245+
all(isfinite, Xexo) || throw("Some observations for the exogeneous variables are infinite")
246+
247+
# Compute weights
248+
if has_weights
249+
weights = Weights(convert(Vector{Float64}, view(df, esample, weights)))
250+
else
251+
weights = uweights(nobs)
252+
end
253+
all(isfinite, weights) || throw("Weights are not finite")
254+
sqrtw = sqrt.(weights)
255+
256+
# Compute feM, an AbstractFixedEffectSolver
257+
has_intercept = hasintercept(formula)
258+
has_fe_intercept = false
259+
if has_fes
260+
if any(fe.interaction isa UnitWeights for fe in fes)
261+
has_fe_intercept = true
262+
end
263+
fes = FixedEffect[fe[esample] for fe in fes]
264+
feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, weights, Val{method}, nthreads)
265+
end
266+
# Compute data for std errors
267+
vcov_method = Vcov.materialize(view(df, esample, :), vcov)
268+
269+
203270
# compute tss now before potentially demeaning y
204271
tss_total = tss(y, has_intercept | has_fe_intercept, weights)
205272
# create uninitialized

src/partial_out.jl

Lines changed: 45 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,52 @@ function partial_out(
6565
fes, ids, ids_fes, formula = parse_fixedeffect(df, formula)
6666
has_fes = !isempty(fes)
6767

68-
6968
nobs = sum(esample)
7069
(nobs > 0) || throw("sample is empty")
70+
71+
if has_fes
72+
# in case some FixedEffect does not have interaction, remove the intercept
73+
if any(isa(fe.interaction, UnitWeights) for fe in fes)
74+
formula = FormulaTerm(formula.lhs, tuple(ConstantTerm(0), (t for t in eachterm(formula.rhs) if t!= ConstantTerm(1))...))
75+
has_fes_intercept = true
76+
end
77+
end
78+
79+
# Extract Y
80+
vars = unique(StatsModels.termvars(formula))
81+
subdf = Tables.columntable(disallowmissing!(df[esample, vars]))
82+
formula_y = FormulaTerm(ConstantTerm(0), (ConstantTerm(0), eachterm(formula.lhs)...))
83+
formula_y_schema = apply_schema(formula_y, schema(formula_y, subdf, contrasts), StatisticalModel)
84+
Y = modelmatrix(formula_y_schema, subdf)
85+
86+
# Extract X
87+
formula_x = FormulaTerm(ConstantTerm(0), formula.rhs)
88+
formula_x_schema = apply_schema(formula_x, schema(formula_x, subdf, contrasts), StatisticalModel)
89+
X = modelmatrix(formula_x_schema, subdf)
90+
91+
# added in PR #109 to handle cases where formula terms introduce missings
92+
# to be removed when fixed in StatsModels
93+
esample2 = trues(size(Y, 1))
94+
for c in eachcol(Y)
95+
esample2 .&= .!ismissing.(c)
96+
end
97+
if size(X, 2) > 0 # X can have zero columns if all regressors are fixed effects
98+
for c in eachcol(X)
99+
esample2 .&= .!ismissing.(c)
100+
end
101+
end
102+
103+
if any(!, esample2)
104+
esample = esample2
105+
Y = Y[esample,:]
106+
X = X[esample,:]
107+
nobs = sum(esample)
108+
end
109+
110+
# Disallow missings
111+
Y = convert(Matrix{Float64}, Y)
112+
X = convert(Matrix{Float64}, X)
113+
71114
# Compute weights
72115
if has_weights
73116
weights = Weights(convert(Vector{Float64}, view(df, esample, weights)))
@@ -85,14 +128,8 @@ function partial_out(
85128
fes = FixedEffect[fe[esample] for fe in fes]
86129
feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, weights, Val{method})
87130
end
88-
131+
89132
# Compute residualized Y
90-
vars = unique(StatsModels.termvars(formula))
91-
subdf = Tables.columntable(disallowmissing!(df[esample, vars]))
92-
formula_y = FormulaTerm(ConstantTerm(0), (ConstantTerm(0), eachterm(formula.lhs)...))
93-
formula_y_schema = apply_schema(formula_y, schema(formula_y, subdf, contrasts), StatisticalModel)
94-
Y = convert(Matrix{Float64}, modelmatrix(formula_y_schema, subdf))
95-
96133
ynames = coefnames(formula_y_schema)[2]
97134
if !isa(ynames, Vector)
98135
ynames = [ynames]
@@ -107,9 +144,6 @@ function partial_out(
107144
end
108145

109146
# Compute residualized X
110-
formula_x = FormulaTerm(ConstantTerm(0), formula.rhs)
111-
formula_x_schema = apply_schema(formula_x, schema(formula_x, subdf, contrasts), StatisticalModel)
112-
X = convert(Matrix{Float64}, modelmatrix(formula_x_schema, subdf))
113147
if has_fes
114148
X, b, c = solve_residuals!(X, feM; maxiter = maxiter, tol = tol, progress_bar = false)
115149
append!(iterations, b)

test/Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
66
FixedEffects = "c8885935-8500-56a7-9867-7708b20db0eb"
77
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
88
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
9+
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
910
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
1011

1112
[compat]

test/fit.jl

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,93 @@ df.Price_zero = copy(df.Price)
228228
df.Price_zero[1] = 0.0
229229
m = @formula Sales ~ log(Price_zero)
230230
@test_throws "Some observations for the regressor are infinite" reg(df, m)
231+
232+
# function that introduces missing
233+
using StatsModels: lag
234+
df.id1 = df.State
235+
df.y = df.Sales
236+
df.x1 = df.Price
237+
df.z1 = df.Pimin
238+
df.w = df.Pop
239+
df.x1_lagged = lag(df.x1)
240+
df.z1_lagged = lag(df.z1)
241+
df.y_lagged = lag(df.y)
242+
243+
244+
df.x1_m1 = Array{Union{Float64,Missing}}(copy(df.x1))
245+
df.x1_m1[end-20:end] .= missing
246+
247+
df.x1_m2 = Array{Union{Float64,Missing}}(copy(df.x1))
248+
df.x1_m2[5:10:end] .= missing
249+
250+
df.x1_lagged = lag(df.x1)
251+
df.z1_lagged = lag(df.z1)
252+
253+
254+
function test_lags(m0, m1, descr)
255+
@testset "$descr" begin
256+
x0 = reg(df, m0, Vcov.cluster(:id1), weights=:w)
257+
x1 = reg(df, m1, Vcov.cluster(:id1), weights=:w)
258+
259+
@test x0.residuals == x1.residuals
260+
@test x0.coef == x1.coef
261+
@test x0.nobs == x1.nobs
262+
@test x0.vcov == x1.vcov
263+
end
264+
end
265+
266+
function test_lags_broken(m0, m1, descr)
267+
@testset "$descr" begin
268+
x0 = reg(df, m0, Vcov.cluster(:id1), weights=:w)
269+
@test_throws ArgumentError reg(df, m1, Vcov.cluster(:id1), weights=:w)
270+
271+
#@test_ x0.coef != x1.coef
272+
#@test_ x0.vcov != x1.vcov
273+
end
274+
end
275+
276+
# NOTE: This is a "dumb" lag function,
277+
# it doesn't take into account time and group indices!
278+
@testset "missings from @formula" begin
279+
m0 = @formula y ~ x1_lagged + fe(id1)
280+
m1 = @formula y ~ lag(x1) + fe(id1)
281+
test_lags(m0, m1, "ols: _ ~ lag")
282+
283+
m0 = @formula y_lagged ~ x1 + fe(id1)
284+
m1 = @formula lag(y) ~ x1 + fe(id1)
285+
test_lags(m0, m1, "ols: lag ~ _")
286+
287+
m0 = @formula y ~ (x1_lagged ~ z1_lagged) + fe(id1)
288+
m1 = @formula y ~ (lag(x1) ~ lag(z1)) + fe(id1)
289+
test_lags(m0, m1, "iv: _ ~ (lag ~ lag)")
290+
291+
m0 = @formula y ~ (x1_lagged ~ z1) + fe(id1)
292+
m1 = @formula y ~ (lag(x1) ~ z1) + fe(id1)
293+
test_lags(m0, m1, "iv: _ ~ (lag ~ _)")
294+
295+
m0 = @formula y ~ (x1 ~ z1_lagged) + fe(id1)
296+
m1 = @formula y ~ (x1 ~ lag(z1)) + fe(id1)
297+
test_lags(m0, m1, "iv: _ ~ (_ ~ lag)")
298+
299+
# NOTE: The case where the df contains missings and the formula contains missings cannot be handled yet. The case with :x1_m1 would actually work, but the case with :x1_m2 would not. This is because the missings in x1_m1 and x1_m2 are removed BEFORE the lag is applied.
300+
301+
m0 = @formula y_lagged ~ x1_m1 + fe(id1)
302+
m1 = @formula lag(y) ~ x1_m1 + fe(id1)
303+
test_lags_broken(m0, m1, "ols: lag ~ _, with missings")
304+
305+
m0 = @formula y_lagged ~ x1_m2 + fe(id1)
306+
m1 = @formula lag(y) ~ x1_m2 + fe(id1)
307+
test_lags_broken(m0, m1, "ols: lag ~ _, with missings")
308+
309+
m0 = @formula y ~ (x1_m1 ~ z1_lagged) + fe(id1)
310+
m1 = @formula y ~ (x1_m1 ~ lag(z1)) + fe(id1)
311+
test_lags_broken(m0, m1, "iv: _ ~ (_ ~ lag), with missings")
312+
313+
m0 = @formula y ~ (x1_m2 ~ z1_lagged) + fe(id1)
314+
m1 = @formula y ~ (x1_m2 ~ lag(z1)) + fe(id1)
315+
test_lags_broken(m0, m1, "iv: _ ~ (_ ~ lag), with missings")
316+
end
317+
231318
##############################################################################
232319
##
233320
## collinearity

0 commit comments

Comments
 (0)