
Commit 878f096

davmre authored and tensorflower-gardener committed
STS: update one_step_predictive and impute_missing_values to expose per-timestep log probabilities.
This is strictly more flexible than the existing behavior, and supports the use of numerical root search to find quantiles of the predictive distribution. It also matches the behavior of the predictive distributions constructed by the Gibbs sampling code at `tfp.experimental.sts_gibbs.gibbs_sampler.one_step_predictive`.

PiperOrigin-RevId: 384605127
1 parent 27155a5 commit 878f096
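For orientation, a minimal sketch of the behavioral change this commit introduces. The `LocalLinearTrend`/`Sum` model, series, and draw counts below are illustrative stand-ins, and prior samples stand in for real posterior samples:

```python
import tensorflow as tf
import tensorflow_probability as tfp

observed_time_series = tf.random.normal([100])  # illustrative series
trend = tfp.sts.LocalLinearTrend(observed_time_series=observed_time_series)
model = tfp.sts.Sum([trend], observed_time_series=observed_time_series)
# Stand-in for posterior samples; any draws with the right shapes work here.
parameter_samples = [param.prior.sample(10) for param in model.parameters]

# Deprecated default: timesteps are event shape, so log_prob returns a
# single number for the whole series.
whole_series_dist = tfp.sts.one_step_predictive(
    model, observed_time_series, parameter_samples=parameter_samples)
print(whole_series_dist.log_prob(observed_time_series).shape)  # ()

# New behavior: timesteps are batch shape, so log_prob is per-timestep.
per_step_dist = tfp.sts.one_step_predictive(
    model, observed_time_series, parameter_samples=parameter_samples,
    timesteps_are_event_shape=False)
per_step_lp = per_step_dist.log_prob(observed_time_series)  # shape [100]
# Summing matches the deprecated event-shape behavior, which (after this
# commit) is just an `Independent` wrapper over the same per-timestep mixture.
print(tf.reduce_sum(per_step_lp, axis=-1))
```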

File tree

3 files changed (+89 −56 lines)

tensorflow_probability/python/sts/forecast.py

Lines changed: 52 additions & 9 deletions
@@ -25,6 +25,8 @@
 from tensorflow_probability.python.internal import distribution_util as dist_util
 from tensorflow_probability.python.sts.internal import util as sts_util
 
+from tensorflow.python.util import deprecation  # pylint: disable=g-direct-tensorflow-import
+
 
 def _prefer_static_event_ndims(distribution):
   if distribution.event_shape.ndims is not None:
@@ -33,7 +35,18 @@ def _prefer_static_event_ndims(distribution):
   return tf.size(distribution.event_shape_tensor())
 
 
-def one_step_predictive(model, observed_time_series, parameter_samples):
+@deprecation.deprecated_arg_values(
+    '2021-12-31',
+    'Predictive distributions returned by `tfp.sts.one_step_predictive` will '
+    'soon compute per-timestep probabilities (treating timesteps as part of '
+    'the batch shape) instead of a single probability for an entire series '
+    '(the current approach, in which timesteps are treated as event shape). '
+    'Please update your code to pass `timesteps_are_event_shape=False` (this '
+    'will soon be the default) and to explicitly sum over the per-timestep log '
+    'probabilities if this is required.',
+    timesteps_are_event_shape=True)
+def one_step_predictive(model, observed_time_series, parameter_samples,
+                        timesteps_are_event_shape=True):
   """Compute one-step-ahead predictive distributions for all timesteps.
 
   Given samples from the posterior over parameters, return the predictive
@@ -55,11 +68,16 @@ def one_step_predictive(model, observed_time_series, parameter_samples):
      param.prior.batch_shape, param.prior.event_shape]) for param in
      model.parameters]`. This may optionally also be a map (Python `dict`) of
      parameter names to `Tensor` values.
+    timesteps_are_event_shape: Deprecated, for backwards compatibility only.
+      If `False`, the predictive distribution will return per-timestep
+      probabilities.
+      Default value: `True`.
 
   Returns:
-    forecast_dist: a `tfd.MixtureSameFamily` instance with event shape
-      [num_timesteps] and
-      batch shape `concat([sample_shape, model.batch_shape])`, with
+    predictive_dist: a `tfd.MixtureSameFamily` instance with event shape
+      `[num_timesteps] if timesteps_are_event_shape else []` and
+      batch shape `concat([sample_shape, model.batch_shape,
+      [] if timesteps_are_event_shape else [num_timesteps])`, with
       `num_posterior_draws` mixture components. The `t`th step represents the
       forecast distribution `p(observed_time_series[t] |
       observed_time_series[0:t-1], parameter_samples)`.
@@ -168,9 +186,13 @@ def plot_one_step_predictive(observed_time_series,
 
   # Squeeze dims to convert from LGSSM's event shape `[num_timesteps, 1]`
   # to a scalar time series.
-  return sts_util.mix_over_posterior_draws(
+  predictive_dist = sts_util.mix_over_posterior_draws(
       means=observation_means[..., 0],
       variances=observation_covs[..., 0, 0])
+  if timesteps_are_event_shape:
+    predictive_dist = tfd.Independent(
+        predictive_dist, reinterpreted_batch_ndims=1)
+  return predictive_dist
 
 
 def forecast(model,
@@ -383,10 +405,21 @@ def plot_forecast(observed_time_series,
       components_distribution=forecast_ssm)
 
 
+@deprecation.deprecated_arg_values(
+    '2021-12-31',
+    'Imputed distributions returned by `tfp.sts.impute_missing_values` will '
+    'soon compute per-timestep probabilities (treating timesteps as part of '
+    'the batch shape) instead of a single probability for an entire series '
+    '(the current approach, in which timesteps are treated as event shape). '
+    'Please update your code to pass `timesteps_are_event_shape=False` (this '
+    'will soon be the default) and to explicitly sum over the per-timestep log '
+    'probabilities if this is required.',
+    timesteps_are_event_shape=True)
 def impute_missing_values(model,
                           observed_time_series,
                           parameter_samples,
-                          include_observation_noise=False):
+                          include_observation_noise=False,
+                          timesteps_are_event_shape=True):
   """Runs posterior inference to impute the missing values in a time series.
 
   This method computes the posterior marginals `p(latent state | observations)`,
@@ -417,11 +450,17 @@ def impute_missing_values(model,
       values that could be *observed* at each timestep, including any i.i.d.
       observation noise.
       Default value: `False`.
+    timesteps_are_event_shape: Deprecated, for backwards compatibility only.
+      If `False`, the predictive distribution will return per-timestep
+      probabilities.
+      Default value: `True`.
 
   Returns:
     imputed_series_dist: a `tfd.MixtureSameFamily` instance with event shape
-      [num_timesteps] and batch shape `concat([sample_shape,
-      model.batch_shape])`, with `num_posterior_draws` mixture components.
+      `[num_timesteps] if timesteps_are_event_shape else []` and
+      batch shape `concat([sample_shape, model.batch_shape,
+      [] if timesteps_are_event_shape else [num_timesteps])`, with
+      `num_posterior_draws` mixture components.
 
   #### Example
 
@@ -497,6 +536,10 @@ def impute_missing_values(model,
 
   # Squeeze dims to convert from LGSSM's event shape `[num_timesteps, 1]`
   # to a scalar time series.
-  return sts_util.mix_over_posterior_draws(
+  imputed_values_dist = sts_util.mix_over_posterior_draws(
      means=observation_means[..., 0],
      variances=observation_covs[..., 0, 0])
+  if timesteps_are_event_shape:
+    imputed_values_dist = tfd.Independent(
+        imputed_values_dist, reinterpreted_batch_ndims=1)
+  return imputed_values_dist
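The per-timestep batch shape is what enables the quantile search mentioned in the commit message: `cdf` now evaluates elementwise across the series. A bisection sketch, assuming a distribution built with `timesteps_are_event_shape=False` whose components implement `cdf` (the Normal mixtures returned here do); the bracketing interval, iteration count, and function name are illustrative:

```python
import tensorflow as tf

def predictive_quantile(dist, q, low=-1e3, high=1e3, num_iters=64):
  """Finds per-timestep quantiles of `dist` by bisection on its CDF.

  Assumes `dist` was built with `timesteps_are_event_shape=False`, so
  `dist.cdf(x)` returns one value per timestep.
  """
  shape = dist.batch_shape_tensor()
  low = tf.fill(shape, tf.cast(low, dist.dtype))
  high = tf.fill(shape, tf.cast(high, dist.dtype))
  for _ in range(num_iters):
    mid = (low + high) / 2.
    too_low = dist.cdf(mid) < q  # elementwise over timesteps
    low = tf.where(too_low, mid, low)
    high = tf.where(too_low, high, mid)
  return (low + high) / 2.

# e.g., per-timestep 95th percentiles of the one-step-ahead predictions:
# upper_band = predictive_quantile(per_step_dist, 0.95)
```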

tensorflow_probability/python/sts/forecast_test.py

Lines changed: 32 additions & 39 deletions
@@ -56,9 +56,9 @@ def test_one_step_predictive_correctness(self):
         [observation_noise_scale])}
 
     onestep_dist = tfp.sts.one_step_predictive(model, observed_time_series,
+                                               timesteps_are_event_shape=False,
                                                parameter_samples=params)
-    onestep_mean_, onestep_scale_ = self.evaluate(
-        (onestep_dist.mean(), onestep_dist.stddev()))
+    onestep_mean, onestep_scale = onestep_dist.mean(), onestep_dist.stddev()
 
     # Since Seasonal is just a set of interleaved random walks, it's
     # straightforward to compute the forecast analytically.
@@ -80,8 +80,8 @@ def test_one_step_predictive_correctness(self):
     expected_onestep_scale = np.concatenate([
         [np.sqrt(1.**2 + observation_noise_scale**2)] * 4,
         [np.sqrt(observation_predictive_variance)] * 4])
-    self.assertAllClose(onestep_mean_, expected_onestep_mean)
-    self.assertAllClose(onestep_scale_, expected_onestep_scale)
+    self.assertAllClose(onestep_mean, expected_onestep_mean)
+    self.assertAllClose(onestep_scale, expected_onestep_scale)
 
   def test_one_step_predictive_with_batch_shape(self):
     num_param_samples = 5
@@ -95,16 +95,16 @@ def test_one_step_predictive_with_batch_shape(self):
                      for param in model.parameters]
 
     onestep_dist = tfp.sts.one_step_predictive(model, observed_time_series,
+                                               timesteps_are_event_shape=False,
                                                parameter_samples=prior_samples)
 
     self.evaluate(tf1.global_variables_initializer())
-    if self.use_static_shape:
-      self.assertAllEqual(onestep_dist.batch_shape.as_list(), batch_shape)
-    else:
-      self.assertAllEqual(self.evaluate(onestep_dist.batch_shape_tensor()),
-                          batch_shape)
-    onestep_mean_ = self.evaluate(onestep_dist.mean())
-    self.assertAllEqual(onestep_mean_.shape, batch_shape + [num_timesteps])
+    self.assertAllEqual(onestep_dist.batch_shape_tensor(),
+                        batch_shape + [num_timesteps])
+    onestep_mean = onestep_dist.mean()
+    self.assertAllEqual(tf.shape(onestep_mean), batch_shape + [num_timesteps])
+    self.assertAllEqual(tf.shape(onestep_dist.log_prob(onestep_mean)),
+                        batch_shape + [num_timesteps])
 
   def test_forecast_correctness(self):
     observed_time_series_ = np.array([1., -1., -3., 4.])
@@ -125,8 +125,6 @@ def test_forecast_correctness(self):
                                      include_observation_noise=True)
     forecast_mean = forecast_dist.mean()[..., 0]
     forecast_scale = forecast_dist.stddev()[..., 0]
-    forecast_mean_, forecast_scale_ = self.evaluate(
-        (forecast_mean, forecast_scale))
 
     # Since Seasonal is just a set of interleaved random walks, it's
     # straightforward to compute the forecast analytically.
@@ -143,8 +141,8 @@ def test_forecast_correctness(self):
     expected_forecast_scale = np.concatenate([
         [np.sqrt(observation_predictive_variance)] * 4,
         [np.sqrt(observation_predictive_variance + drift_scale**2)] * 4])
-    self.assertAllClose(forecast_mean_, expected_forecast_mean)
-    self.assertAllClose(forecast_scale_, expected_forecast_scale)
+    self.assertAllClose(forecast_mean, expected_forecast_mean)
+    self.assertAllClose(forecast_scale, expected_forecast_scale)
 
     # Also test forecasting the noise-free function.
     forecast_dist = tfp.sts.forecast(model, observed_time_series,
@@ -153,15 +151,13 @@ def test_forecast_correctness(self):
                                      include_observation_noise=False)
     forecast_mean = forecast_dist.mean()[..., 0]
     forecast_scale = forecast_dist.stddev()[..., 0]
-    forecast_mean_, forecast_scale_ = self.evaluate(
-        (forecast_mean, forecast_scale))
 
     noiseless_predictive_variance = (effect_posterior_variance + drift_scale**2)
     expected_forecast_scale = np.concatenate([
         [np.sqrt(noiseless_predictive_variance)] * 4,
         [np.sqrt(noiseless_predictive_variance + drift_scale**2)] * 4])
-    self.assertAllClose(forecast_mean_, expected_forecast_mean)
-    self.assertAllClose(forecast_scale_, expected_forecast_scale)
+    self.assertAllClose(forecast_mean, expected_forecast_mean)
+    self.assertAllClose(forecast_scale, expected_forecast_scale)
 
   def test_forecast_from_hmc(self):
     # test that we can directly plug in the output of an HMC chain as
@@ -220,15 +216,9 @@ def test_forecast_with_batch_shape(self):
                                      num_steps_forecast=num_steps_forecast)
 
     self.evaluate(tf1.global_variables_initializer())
-    if self.use_static_shape:
-      self.assertAllEqual(forecast_dist.batch_shape.as_list(), batch_shape)
-    else:
-      self.assertAllEqual(self.evaluate(forecast_dist.batch_shape_tensor()),
-                          batch_shape)
-    forecast_mean = forecast_dist.mean()[..., 0]
-    forecast_mean_ = self.evaluate(forecast_mean)
-    self.assertAllEqual(forecast_mean_.shape,
-                        batch_shape + [num_steps_forecast])
+    self.assertAllEqual(forecast_dist.batch_shape_tensor(), batch_shape)
+    self.assertAllEqual(tf.shape(forecast_dist.mean()),
+                        batch_shape + [num_steps_forecast, 1])
 
   def test_methods_handle_masked_inputs(self):
     num_param_samples = 5
@@ -268,6 +258,7 @@ def test_methods_handle_masked_inputs(self):
     self.assertTrue(np.all(np.isfinite(onestep_stddev_)))
 
   def test_impute_missing(self):
+    num_timesteps = 7
     time_series_with_nans = self._build_tensor(
         [-1., 1., np.nan, 2.4, np.nan, np.nan, 2.])
     observed_time_series = tfp.sts.MaskedTimeSeries(
@@ -288,19 +279,21 @@ def test_impute_missing(self):
     parameter_samples = {'observation_noise_scale': [noise_scale],
                          'seasonal/_drift_scale': [drift_scale]}
     imputed_series_dist = tfp.sts.impute_missing_values(
-        model, observed_time_series, parameter_samples)
+        model, observed_time_series, parameter_samples,
+        timesteps_are_event_shape=False)
     imputed_noisy_series_dist = tfp.sts.impute_missing_values(
         model, observed_time_series, parameter_samples,
+        timesteps_are_event_shape=False,
         include_observation_noise=True)
+    self.assertAllEqual(imputed_noisy_series_dist.batch_shape_tensor(),
+                        [num_timesteps])
 
     # Compare imputed mean to expected mean.
-    mean_, stddev_ = self.evaluate([imputed_series_dist.mean(),
-                                    imputed_series_dist.stddev()])
-    noisy_mean_, noisy_stddev_ = self.evaluate([
-        imputed_noisy_series_dist.mean(),
-        imputed_noisy_series_dist.stddev()])
-    self.assertAllClose(mean_, [-1., 1., 2., 2.4, -1., 1., 2.], atol=1e-2)
-    self.assertAllClose(mean_, noisy_mean_, atol=1e-2)
+    mean, stddev = imputed_series_dist.mean(), imputed_series_dist.stddev()
+    noisy_mean, noisy_stddev = [imputed_noisy_series_dist.mean(),
+                                imputed_noisy_series_dist.stddev()]
+    self.assertAllClose(mean, [-1., 1., 2., 2.4, -1., 1., 2.], atol=1e-2)
+    self.assertAllClose(mean, noisy_mean, atol=1e-2)
 
     # Compare imputed stddevs to expected stddevs.
     drift_plus_noise_scale = np.sqrt(noise_scale**2 + drift_scale**2)
@@ -311,9 +304,9 @@ def test_impute_missing(self):
                                     drift_plus_noise_scale,
                                     drift_plus_noise_scale,
                                     noise_scale])
-    self.assertAllClose(stddev_, expected_stddev, atol=1e-2)
-    self.assertAllClose(noisy_stddev_,
-                        np.sqrt(stddev_**2 + noise_scale**2), atol=1e-2)
+    self.assertAllClose(stddev, expected_stddev, atol=1e-2)
+    self.assertAllClose(noisy_stddev,
+                        tf.sqrt(stddev**2 + noise_scale**2), atol=1e-2)
 
   def _build_tensor(self, ndarray, dtype=None):
     """Convert a numpy array to a TF placeholder.

tensorflow_probability/python/sts/internal/util.py

Lines changed: 5 additions & 8 deletions
@@ -376,7 +376,7 @@ def mix_over_posterior_draws(means, variances):
   Returns:
     mixture_dist: `tfd.MixtureSameFamily(tfd.Independent(tfd.Normal))` instance
       representing a uniform mixture over the posterior samples, with
-      `batch_shape = ...` and `event_shape = [num_timesteps]`.
+      `batch_shape = [..., num_timesteps]` and `event_shape = []`.
 
   """
   # The inputs `means`, `variances` have shape
@@ -387,19 +387,16 @@ def mix_over_posterior_draws(means, variances):
   #      [num_timesteps]])`
   # Because MixtureSameFamily mixes over the rightmost batch dimension,
   # we need to move the `num_posterior_draws` dimension to be rightmost
-  # in the batch shape. This requires use of `Independent` (to preserve
-  # `num_timesteps` as part of the event shape) and `move_dimension`.
+  # in the batch shape.
   # TODO(b/120245392): enhance `MixtureSameFamily` to reduce along an
   # arbitrary axis, and eliminate `move_dimension` calls here.
 
   with tf.name_scope('mix_over_posterior_draws'):
     num_posterior_draws = ps.shape(means)[0]
 
-    component_observations = tfd.Independent(
-        distribution=tfd.Normal(
-            loc=dist_util.move_dimension(means, 0, -2),
-            scale=tf.sqrt(dist_util.move_dimension(variances, 0, -2))),
-        reinterpreted_batch_ndims=1)
+    component_observations = tfd.Normal(
+        loc=dist_util.move_dimension(means, 0, -1),
+        scale=tf.sqrt(dist_util.move_dimension(variances, 0, -1)))
 
     return tfd.MixtureSameFamily(
         mixture_distribution=tfd.Categorical(