Merge pull request #743 from pymc-devs/missingmaster2

jsalvatier · jsalvatier · commit 96e03fcbfece · 2015-06-07T15:27:53.000-07:00
Missing values imputation implementation
diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py
@@ -2,7 +2,7 @@
 import numpy as np
 from ..model import Model
 
-__all__ = ['DensityDist', 'Distribution', 'Continuous', 'Discrete']
+__all__ = ['DensityDist', 'Distribution', 'Continuous', 'Discrete', 'NoDistribution', 'TensorType']
 
 
 class Distribution(object):
@@ -69,6 +69,10 @@ def getattr_value(self, val):
 def TensorType(dtype, shape):
     return t.TensorType(str(dtype), np.atleast_1d(shape) == 1)
 
+class NoDistribution(Distribution):
+    def logp(self, x):
+        return 0
+
 class Discrete(Distribution):
     """Base class for discrete distributions"""
     def __init__(self, shape=(), dtype='int64', defaults=['mode'], *args, **kwargs):
diff --git a/pymc3/examples/arbitrary_stochastic.py b/pymc3/examples/arbitrary_stochastic.py
@@ -9,7 +9,7 @@
     def logp(failure, value):
         return sum(failure * log(lam) - lam * value)
 
-    x = DensityDist('x', logp, observed=(failure, value))
+    x = DensityDist('x', logp, observed={'failure':failure, 'value':value})
 
 
 def run (n=3000):
diff --git a/pymc3/examples/data/test_scores.csv b/pymc3/examples/data/test_scores.csv
@@ -0,0 +1,17 @@
+score,male,sib,synd_or_disab,age_test,mother_hs,ident_by_3
+100,0,1,0,5,1,0
+35,0,2,0,9,0,1
+24,1,NaN,1,7,1,0
+75,1,0,0,2,1,1
+24,1,NaN,1,7,1,0
+81,1,0,0,2,1,1
+24,1,3,1,7,1,0
+64,1,0,NaN,11,1,0
+18,1,1,1,9,1,0
+24,1,1,0,8,0,0
+24,1,1,1,7,1,0
+76,0,2,0,2,NaN,0
+36,0,1,0,2,1,1
+16,1,1,0,2,0,0
+95,0,0,0,2,NaN,1
+65,1,2,0,2,0,1
diff --git a/pymc3/examples/disaster_model.ipynb b/pymc3/examples/disaster_model.ipynb
diff --git a/pymc3/examples/disaster_model_missing.py b/pymc3/examples/disaster_model_missing.py
@@ -0,0 +1,68 @@
+"""
+A model for coal mining disasters data with a changepoint
+
+switchpoint ~ U(0, 110)
+early_mean ~ Exp(1.)
+late_mean ~ Exp(1.)
+disasters[t] ~ Po(early_mean if t <= switchpoint, late_mean otherwise)
+
+"""
+
+from pymc3 import *
+
+import theano.tensor as t
+from numpy import arange, array, ones, concatenate
+from numpy.random import randint
+from numpy.ma import masked_values
+
+__all__ = ['disasters_data', 'switchpoint', 'early_mean', 'late_mean', 'rate',
+             'disasters']
+
+# Time series of recorded coal mining disasters in the UK from 1851 to 1962
+disasters_data = array([4, 5, 4, 0, 1, 4, 3, 4, 0, 6, 3, 3, 4, 0, 2, 6,
+                            3, 3, 5, 4, 5, 3, 1, -1, 4, 1, 5, 5, 3, 4, 2, 5,
+                            2, 2, 3, 4, 2, 1, 3, 2, 2, 1, 1, 1, 1, 3, 0, 0,
+                            1, 0, 1, 1, 0, 0, 3, 1, 0, 3, 2, 2, 0, 1, 1, 1,
+                            0, 1, 0, 1, -1, 0, 0, 2, 1, 0, 0, 0, 1, 1, 0, 2,
+                            3, 3, 1, 1, 2, 1, 1, 1, 1, 2, 4, 2, 0, 0, 1, 4,
+                            0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1])
+years = len(disasters_data)
+
+masked_values = masked_values(disasters_data, value=-1)
+
+with Model() as model:
+
+    # Prior for distribution of switchpoint location
+    switchpoint = DiscreteUniform('switchpoint', lower=0, upper=years)
+    # Priors for pre- and post-switch mean number of disasters
+    early_mean = Exponential('early_mean', lam=1.)
+    late_mean = Exponential('late_mean', lam=1.)
+
+    # Allocate appropriate Poisson rates to years before and after current
+    # switchpoint location
+    idx = arange(years)
+    rate = switch(switchpoint >= idx, early_mean, late_mean)
+
+    # Data likelihood
+    disasters = Poisson('disasters', rate, observed=masked_values)
+
+
+def run(n=1000):
+    if n == "short":
+        n = 500
+    with model:
+
+        # Initial values for stochastic nodes
+        start = {'early_mean': 2., 'late_mean': 3.}
+
+        # Use slice sampler for means
+        step1 = Slice([early_mean, late_mean])
+        # Use Metropolis for switchpoint, since it accomodates discrete variables
+        step2 = Metropolis([switchpoint, disasters.missing_values ])
+
+        tr = sample(n, tune=500, start=start, step=[step1, step2])
+        
+        summary(tr, vars=['disasters_missing'])
+
+if __name__ == '__main__':
+    run()
diff --git a/pymc3/examples/lasso_missing.py b/pymc3/examples/lasso_missing.py
@@ -0,0 +1,53 @@
+from pymc3 import *
+import numpy as np
+import pandas as pd
+from numpy.ma import masked_values
+
+# Import data, filling missing values with sentinels (-999)
+test_scores = pd.read_csv(get_data_file('pymc3.examples', 'data/test_scores.csv')).fillna(-999)
+
+# Extract variables: test score, gender, number of siblings, previous disability, age, 
+# mother with HS education or better, hearing loss identified by 3 months of age
+(score, male, siblings, disability, 
+    age, mother_hs, early_ident) = test_scores[['score', 'male', 'sib', 
+                                                'synd_or_disab', 'age_test',
+                                                'mother_hs', 'ident_by_3']].astype(float).values.T
+
+with Model() as model:
+
+    # Impute missing values
+    sib_mean = Exponential('sib_mean', 1)
+    siblings_imp = Poisson('siblings_imp', sib_mean, observed=masked_values(siblings, value=-999))
+
+    p_disab = Beta('p_disab', 1, 1)
+    disability_imp = Bernoulli('disability_imp', p_disab, observed=masked_values(disability, value=-999))
+
+    p_mother = Beta('p_mother', 1, 1)
+    mother_imp = Bernoulli('mother_imp', p_mother, observed=masked_values(mother_hs, value=-999))
+
+    s = HalfCauchy('s', 5, testval=5)
+    beta = Laplace('beta', 0, 100, shape=7, testval=.1)
+
+    expected_score = (beta[0] + beta[1]*male + beta[2]*siblings_imp + beta[3]*disability_imp + 
+        beta[4]*age + beta[5]*mother_imp + beta[6]*early_ident)
+
+    observed_score = Normal('observed_score', expected_score, s ** -2, observed=score)
+
+
+with model:
+    start = find_MAP()
+    step1 = NUTS([beta, s, p_disab, p_mother, sib_mean], scaling=start)
+
+    step2 = Metropolis([mother_imp.missing_values, 
+                        disability_imp.missing_values,
+                        siblings_imp.missing_values])
+
+def run(n=5000):
+    if n == 'short':
+        n = 100
+    with model:
+        trace = sample(n, [step1, step2], start)
+
+
+if __name__ == '__main__':
+    run()
diff --git a/pymc3/model.py b/pymc3/model.py
@@ -90,6 +90,7 @@ def __init__(self):
         self.observed_RVs = []
         self.deterministics = []
         self.potentials = []
+        self.missing_values = []
         self.model = self
 
     @property
@@ -101,7 +102,7 @@ def logpt(self):
 
     @property
     def vars(self):
-        """List of unobserved random variables the model is defined in terms of (which excludes deterministics)."""
+        """List of unobserved random variables used as inputs to the model (which excludes deterministics)."""
         return self.free_RVs
 
     @property
@@ -112,7 +113,7 @@ def basic_RVs(self):
     @property
     def unobserved_RVs(self):
         """List of all random variable, including deterministic ones."""
-        return self.free_RVs + self.deterministics
+        return self.vars + self.deterministics
 
 
     @property
@@ -147,9 +148,19 @@ def Var(self, name, dist, data=None):
                 var = TransformedRV(name=name, distribution=dist, model=self, transform=dist.transform) 
                 self.deterministics.append(var)
                 return var
-        else:
+        elif isinstance(data, dict):
+            var = MultiObservedRV(name=name, data=data, distribution=dist, model=self)
+            self.observed_RVs.append(var)
+            if var.missing_values:
+                self.free_RVs += var.missing_values
+                self.missing_values += var.missing_values
+        else: 
             var = ObservedRV(name=name, data=data, distribution=dist, model=self)
             self.observed_RVs.append(var)
+            if var.missing_values:
+                self.free_RVs.append(var.missing_values)
+                self.missing_values.append(var.missing_values)
+
         self.add_random_variable(var)
         return var
 
@@ -342,8 +353,78 @@ def __init__(self, type=None, owner=None, index=None, name=None, distribution=No
             self.logp_elemwiset = distribution.logp(self)
             self.model = model
 
-class ObservedRV(Factor):
-    """Observed random variable that a model is specified in terms of."""
+def pandas_to_array(data):
+    if hasattr(data, 'values'): #pandas
+        if data.isnull().any().any(): #missing values
+            return np.ma.MaskedArray(data.values, data.isnull().values)
+        else: 
+            return data.values
+    elif hasattr(data, 'mask'):
+        return data
+    elif isinstance(data, theano.gof.graph.Variable):
+        return data 
+    else:
+        return np.asarray(data)
+        
+
+def as_tensor(data, name,model, dtype):
+    data = pandas_to_array(data).astype(dtype)
+
+    if hasattr(data, 'mask'): 
+        from .distributions import NoDistribution
+        fakedist = NoDistribution.dist(shape=data.mask.sum(), dtype=dtype, testval=data.mean().astype(dtype))
+        missing_values = FreeRV(name=name + '_missing', distribution=fakedist, model=model)
+
+        constant = t.as_tensor_variable(data.filled())
+
+        dataTensor = theano.tensor.set_subtensor(constant[data.mask.nonzero()], missing_values) 
+        dataTensor.missing_values = missing_values
+        return dataTensor
+    else:
+        data = t.as_tensor_variable(data, name=name)
+        data.missing_values = None
+        return data
+
+class ObservedRV(Factor, TensorVariable):
+    """Observed random variable that a model is specified in terms of.
+    Potentially partially observed.
+    """
+    def __init__(self, type=None, owner=None, index=None, name=None, data=None, distribution=None, model=None):
+        """
+        Parameters
+        ----------
+
+        type : theano type (optional)
+        owner : theano owner (optional)
+
+        name : str
+        distribution : Distribution
+        model : Model
+        """
+        from .distributions import TensorType
+        if type is None:
+            data = pandas_to_array(data)
+            type = TensorType(distribution.dtype, data.shape)
+
+        super(TensorVariable, self).__init__(type, None, None, name)
+
+        if distribution is not None:
+            data = as_tensor(data, name,model,distribution.dtype) 
+            self.missing_values = data.missing_values
+
+            self.logp_elemwiset = distribution.logp(data)
+            self.model = model
+            self.distribution = distribution
+
+            #make this RV a view on the combined missing/nonmissing array
+            theano.gof.Apply(theano.compile.view_op, inputs=[data], outputs=[self])
+
+            self.tag.test_value = theano.compile.view_op(data).tag.test_value
+
+class MultiObservedRV(Factor):
+    """Observed random variable that a model is specified in terms of.
+    Potentially partially observed.
+    """
     def __init__(self, name, data, distribution, model):
         """
         Parameters
@@ -357,17 +438,11 @@ def __init__(self, name, data, distribution, model):
         model : Model
         """
         self.name = name
-        data = getattr(data, 'values', data) #handle pandas
-        args = as_iterargs(data)
 
-        if len(args) > 1:
-            params = getargspec(distribution.logp).args
-            args = [t.as_tensor_variable(d, name=name + "_" + param)
-                    for d,param in zip(args,params) ]
-        else:
-            args = [t.as_tensor_variable(args[0], name=name)]
+        self.data = { name : as_tensor(data, name, model, distribution.dtype) for name, data in data.items()}
 
-        self.logp_elemwiset = distribution.logp(*args)
+        self.missing_values = [ data.missing_values for data in self.data.values() if data.missing_values is not None]
+        self.logp_elemwiset = distribution.logp(**self.data)
         self.model = model
         self.distribution = distribution
 
@@ -433,8 +508,6 @@ def __init__(self, type=None, owner=None, index=None, name=None, distribution=No
 def as_iterargs(data):
     if isinstance(data, tuple):
         return data
-    if hasattr(data, 'columns'):  # data frames
-        return [np.asarray(data[c]) for c in data.columns]
     else:
         return [data]
 
diff --git a/pymc3/tests/test_missing.py b/pymc3/tests/test_missing.py
@@ -0,0 +1,27 @@
+from pymc3 import Model, Normal, Metropolis, MvNormal
+from numpy import ma
+import numpy
+import pandas as pd
+
+def test_missing():
+    data = ma.masked_values([1,2,-1,4,-1], value=-1)
+    with Model() as model:
+        x = Normal('x', 1, 1)
+        y = Normal('y', x, 1, observed=data)
+
+    y_missing, = model.missing_values 
+    assert y_missing.tag.test_value.shape == (2,)
+
+    model.logp(model.test_point)
+
+
+def test_missing_pandas():
+    data = pd.DataFrame([1,2,numpy.nan,4,numpy.nan])
+    with Model() as model:
+        x = Normal('x', 1, 1)
+        y = Normal('y', x, 1, observed=data)
+
+    y_missing, = model.missing_values 
+    assert y_missing.tag.test_value.shape == (2,)
+
+    model.logp(model.test_point)
diff --git a/pymc3/theanof.py b/pymc3/theanof.py
diff --git a/pymc3/vartypes.py b/pymc3/vartypes.py