Skip to content

Commit 96e03fc

Browse files
committed
Merge pull request #743 from pymc-devs/missingmaster2
Missing values imputation implementation
2 parents e482d0f + 469ec7b commit 96e03fc

File tree

10 files changed

+502
-23
lines changed

10 files changed

+502
-23
lines changed

pymc3/distributions/distribution.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import numpy as np
33
from ..model import Model
44

5-
__all__ = ['DensityDist', 'Distribution', 'Continuous', 'Discrete']
5+
__all__ = ['DensityDist', 'Distribution', 'Continuous', 'Discrete', 'NoDistribution', 'TensorType']
66

77

88
class Distribution(object):
@@ -69,6 +69,10 @@ def getattr_value(self, val):
6969
def TensorType(dtype, shape):
7070
return t.TensorType(str(dtype), np.atleast_1d(shape) == 1)
7171

72+
class NoDistribution(Distribution):
73+
def logp(self, x):
74+
return 0
75+
7276
class Discrete(Distribution):
7377
"""Base class for discrete distributions"""
7478
def __init__(self, shape=(), dtype='int64', defaults=['mode'], *args, **kwargs):

pymc3/examples/arbitrary_stochastic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
def logp(failure, value):
1010
return sum(failure * log(lam) - lam * value)
1111

12-
x = DensityDist('x', logp, observed=(failure, value))
12+
x = DensityDist('x', logp, observed={'failure':failure, 'value':value})
1313

1414

1515
def run (n=3000):
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
score,male,sib,synd_or_disab,age_test,mother_hs,ident_by_3
2+
100,0,1,0,5,1,0
3+
35,0,2,0,9,0,1
4+
24,1,NaN,1,7,1,0
5+
75,1,0,0,2,1,1
6+
24,1,NaN,1,7,1,0
7+
81,1,0,0,2,1,1
8+
24,1,3,1,7,1,0
9+
64,1,0,NaN,11,1,0
10+
18,1,1,1,9,1,0
11+
24,1,1,0,8,0,0
12+
24,1,1,1,7,1,0
13+
76,0,2,0,2,NaN,0
14+
36,0,1,0,2,1,1
15+
16,1,1,0,2,0,0
16+
95,0,0,0,2,NaN,1
17+
65,1,2,0,2,0,1

pymc3/examples/disaster_model.ipynb

Lines changed: 199 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
"""
2+
A model for coal mining disasters data with a changepoint
3+
4+
switchpoint ~ U(0, 110)
5+
early_mean ~ Exp(1.)
6+
late_mean ~ Exp(1.)
7+
disasters[t] ~ Po(early_mean if t <= switchpoint, late_mean otherwise)
8+
9+
"""
10+
11+
from pymc3 import *
12+
13+
import theano.tensor as t
14+
from numpy import arange, array, ones, concatenate
15+
from numpy.random import randint
16+
from numpy.ma import masked_values
17+
18+
__all__ = ['disasters_data', 'switchpoint', 'early_mean', 'late_mean', 'rate',
19+
'disasters']
20+
21+
# Time series of recorded coal mining disasters in the UK from 1851 to 1962
22+
disasters_data = array([4, 5, 4, 0, 1, 4, 3, 4, 0, 6, 3, 3, 4, 0, 2, 6,
23+
3, 3, 5, 4, 5, 3, 1, -1, 4, 1, 5, 5, 3, 4, 2, 5,
24+
2, 2, 3, 4, 2, 1, 3, 2, 2, 1, 1, 1, 1, 3, 0, 0,
25+
1, 0, 1, 1, 0, 0, 3, 1, 0, 3, 2, 2, 0, 1, 1, 1,
26+
0, 1, 0, 1, -1, 0, 0, 2, 1, 0, 0, 0, 1, 1, 0, 2,
27+
3, 3, 1, 1, 2, 1, 1, 1, 1, 2, 4, 2, 0, 0, 1, 4,
28+
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1])
29+
years = len(disasters_data)
30+
31+
masked_values = masked_values(disasters_data, value=-1)
32+
33+
with Model() as model:
34+
35+
# Prior for distribution of switchpoint location
36+
switchpoint = DiscreteUniform('switchpoint', lower=0, upper=years)
37+
# Priors for pre- and post-switch mean number of disasters
38+
early_mean = Exponential('early_mean', lam=1.)
39+
late_mean = Exponential('late_mean', lam=1.)
40+
41+
# Allocate appropriate Poisson rates to years before and after current
42+
# switchpoint location
43+
idx = arange(years)
44+
rate = switch(switchpoint >= idx, early_mean, late_mean)
45+
46+
# Data likelihood
47+
disasters = Poisson('disasters', rate, observed=masked_values)
48+
49+
50+
def run(n=1000):
51+
if n == "short":
52+
n = 500
53+
with model:
54+
55+
# Initial values for stochastic nodes
56+
start = {'early_mean': 2., 'late_mean': 3.}
57+
58+
# Use slice sampler for means
59+
step1 = Slice([early_mean, late_mean])
60+
# Use Metropolis for switchpoint, since it accomodates discrete variables
61+
step2 = Metropolis([switchpoint, disasters.missing_values ])
62+
63+
tr = sample(n, tune=500, start=start, step=[step1, step2])
64+
65+
summary(tr, vars=['disasters_missing'])
66+
67+
if __name__ == '__main__':
68+
run()

pymc3/examples/lasso_missing.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from pymc3 import *
2+
import numpy as np
3+
import pandas as pd
4+
from numpy.ma import masked_values
5+
6+
# Import data, filling missing values with sentinels (-999)
7+
test_scores = pd.read_csv(get_data_file('pymc3.examples', 'data/test_scores.csv')).fillna(-999)
8+
9+
# Extract variables: test score, gender, number of siblings, previous disability, age,
10+
# mother with HS education or better, hearing loss identified by 3 months of age
11+
(score, male, siblings, disability,
12+
age, mother_hs, early_ident) = test_scores[['score', 'male', 'sib',
13+
'synd_or_disab', 'age_test',
14+
'mother_hs', 'ident_by_3']].astype(float).values.T
15+
16+
with Model() as model:
17+
18+
# Impute missing values
19+
sib_mean = Exponential('sib_mean', 1)
20+
siblings_imp = Poisson('siblings_imp', sib_mean, observed=masked_values(siblings, value=-999))
21+
22+
p_disab = Beta('p_disab', 1, 1)
23+
disability_imp = Bernoulli('disability_imp', p_disab, observed=masked_values(disability, value=-999))
24+
25+
p_mother = Beta('p_mother', 1, 1)
26+
mother_imp = Bernoulli('mother_imp', p_mother, observed=masked_values(mother_hs, value=-999))
27+
28+
s = HalfCauchy('s', 5, testval=5)
29+
beta = Laplace('beta', 0, 100, shape=7, testval=.1)
30+
31+
expected_score = (beta[0] + beta[1]*male + beta[2]*siblings_imp + beta[3]*disability_imp +
32+
beta[4]*age + beta[5]*mother_imp + beta[6]*early_ident)
33+
34+
observed_score = Normal('observed_score', expected_score, s ** -2, observed=score)
35+
36+
37+
with model:
38+
start = find_MAP()
39+
step1 = NUTS([beta, s, p_disab, p_mother, sib_mean], scaling=start)
40+
41+
step2 = Metropolis([mother_imp.missing_values,
42+
disability_imp.missing_values,
43+
siblings_imp.missing_values])
44+
45+
def run(n=5000):
46+
if n == 'short':
47+
n = 100
48+
with model:
49+
trace = sample(n, [step1, step2], start)
50+
51+
52+
if __name__ == '__main__':
53+
run()

pymc3/model.py

Lines changed: 89 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ def __init__(self):
9090
self.observed_RVs = []
9191
self.deterministics = []
9292
self.potentials = []
93+
self.missing_values = []
9394
self.model = self
9495

9596
@property
@@ -101,7 +102,7 @@ def logpt(self):
101102

102103
@property
103104
def vars(self):
104-
"""List of unobserved random variables the model is defined in terms of (which excludes deterministics)."""
105+
"""List of unobserved random variables used as inputs to the model (which excludes deterministics)."""
105106
return self.free_RVs
106107

107108
@property
@@ -112,7 +113,7 @@ def basic_RVs(self):
112113
@property
113114
def unobserved_RVs(self):
114115
"""List of all random variable, including deterministic ones."""
115-
return self.free_RVs + self.deterministics
116+
return self.vars + self.deterministics
116117

117118

118119
@property
@@ -147,9 +148,19 @@ def Var(self, name, dist, data=None):
147148
var = TransformedRV(name=name, distribution=dist, model=self, transform=dist.transform)
148149
self.deterministics.append(var)
149150
return var
150-
else:
151+
elif isinstance(data, dict):
152+
var = MultiObservedRV(name=name, data=data, distribution=dist, model=self)
153+
self.observed_RVs.append(var)
154+
if var.missing_values:
155+
self.free_RVs += var.missing_values
156+
self.missing_values += var.missing_values
157+
else:
151158
var = ObservedRV(name=name, data=data, distribution=dist, model=self)
152159
self.observed_RVs.append(var)
160+
if var.missing_values:
161+
self.free_RVs.append(var.missing_values)
162+
self.missing_values.append(var.missing_values)
163+
153164
self.add_random_variable(var)
154165
return var
155166

@@ -342,8 +353,78 @@ def __init__(self, type=None, owner=None, index=None, name=None, distribution=No
342353
self.logp_elemwiset = distribution.logp(self)
343354
self.model = model
344355

345-
class ObservedRV(Factor):
346-
"""Observed random variable that a model is specified in terms of."""
356+
def pandas_to_array(data):
357+
if hasattr(data, 'values'): #pandas
358+
if data.isnull().any().any(): #missing values
359+
return np.ma.MaskedArray(data.values, data.isnull().values)
360+
else:
361+
return data.values
362+
elif hasattr(data, 'mask'):
363+
return data
364+
elif isinstance(data, theano.gof.graph.Variable):
365+
return data
366+
else:
367+
return np.asarray(data)
368+
369+
370+
def as_tensor(data, name,model, dtype):
371+
data = pandas_to_array(data).astype(dtype)
372+
373+
if hasattr(data, 'mask'):
374+
from .distributions import NoDistribution
375+
fakedist = NoDistribution.dist(shape=data.mask.sum(), dtype=dtype, testval=data.mean().astype(dtype))
376+
missing_values = FreeRV(name=name + '_missing', distribution=fakedist, model=model)
377+
378+
constant = t.as_tensor_variable(data.filled())
379+
380+
dataTensor = theano.tensor.set_subtensor(constant[data.mask.nonzero()], missing_values)
381+
dataTensor.missing_values = missing_values
382+
return dataTensor
383+
else:
384+
data = t.as_tensor_variable(data, name=name)
385+
data.missing_values = None
386+
return data
387+
388+
class ObservedRV(Factor, TensorVariable):
389+
"""Observed random variable that a model is specified in terms of.
390+
Potentially partially observed.
391+
"""
392+
def __init__(self, type=None, owner=None, index=None, name=None, data=None, distribution=None, model=None):
393+
"""
394+
Parameters
395+
----------
396+
397+
type : theano type (optional)
398+
owner : theano owner (optional)
399+
400+
name : str
401+
distribution : Distribution
402+
model : Model
403+
"""
404+
from .distributions import TensorType
405+
if type is None:
406+
data = pandas_to_array(data)
407+
type = TensorType(distribution.dtype, data.shape)
408+
409+
super(TensorVariable, self).__init__(type, None, None, name)
410+
411+
if distribution is not None:
412+
data = as_tensor(data, name,model,distribution.dtype)
413+
self.missing_values = data.missing_values
414+
415+
self.logp_elemwiset = distribution.logp(data)
416+
self.model = model
417+
self.distribution = distribution
418+
419+
#make this RV a view on the combined missing/nonmissing array
420+
theano.gof.Apply(theano.compile.view_op, inputs=[data], outputs=[self])
421+
422+
self.tag.test_value = theano.compile.view_op(data).tag.test_value
423+
424+
class MultiObservedRV(Factor):
425+
"""Observed random variable that a model is specified in terms of.
426+
Potentially partially observed.
427+
"""
347428
def __init__(self, name, data, distribution, model):
348429
"""
349430
Parameters
@@ -357,17 +438,11 @@ def __init__(self, name, data, distribution, model):
357438
model : Model
358439
"""
359440
self.name = name
360-
data = getattr(data, 'values', data) #handle pandas
361-
args = as_iterargs(data)
362441

363-
if len(args) > 1:
364-
params = getargspec(distribution.logp).args
365-
args = [t.as_tensor_variable(d, name=name + "_" + param)
366-
for d,param in zip(args,params) ]
367-
else:
368-
args = [t.as_tensor_variable(args[0], name=name)]
442+
self.data = { name : as_tensor(data, name, model, distribution.dtype) for name, data in data.items()}
369443

370-
self.logp_elemwiset = distribution.logp(*args)
444+
self.missing_values = [ data.missing_values for data in self.data.values() if data.missing_values is not None]
445+
self.logp_elemwiset = distribution.logp(**self.data)
371446
self.model = model
372447
self.distribution = distribution
373448

@@ -433,8 +508,6 @@ def __init__(self, type=None, owner=None, index=None, name=None, distribution=No
433508
def as_iterargs(data):
434509
if isinstance(data, tuple):
435510
return data
436-
if hasattr(data, 'columns'): # data frames
437-
return [np.asarray(data[c]) for c in data.columns]
438511
else:
439512
return [data]
440513

pymc3/tests/test_missing.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from pymc3 import Model, Normal, Metropolis, MvNormal
2+
from numpy import ma
3+
import numpy
4+
import pandas as pd
5+
6+
def test_missing():
7+
data = ma.masked_values([1,2,-1,4,-1], value=-1)
8+
with Model() as model:
9+
x = Normal('x', 1, 1)
10+
y = Normal('y', x, 1, observed=data)
11+
12+
y_missing, = model.missing_values
13+
assert y_missing.tag.test_value.shape == (2,)
14+
15+
model.logp(model.test_point)
16+
17+
18+
def test_missing_pandas():
19+
data = pd.DataFrame([1,2,numpy.nan,4,numpy.nan])
20+
with Model() as model:
21+
x = Normal('x', 1, 1)
22+
y = Normal('y', x, 1, observed=data)
23+
24+
y_missing, = model.missing_values
25+
assert y_missing.tag.test_value.shape == (2,)
26+
27+
model.logp(model.test_point)

0 commit comments

Comments
 (0)