
Commit 0152efd

Mixture Models API
- Mixtures for continuous random variables
- Stochastic Mixture Models top level trait
- Gaussian Process Mixture model implementation
1 parent b3932de commit 0152efd

6 files changed: +175 additions, -30 deletions

dynaml-core/src/main/scala-2.11/io/github/mandar2812/dynaml/algebra/PartitionedVector.scala

Lines changed: 26 additions & 11 deletions
@@ -7,18 +7,25 @@ import org.apache.spark.rdd.RDD
 import scala.collection.immutable.NumericRange
 
 /**
-  * @author mandar2812 date 13/10/2016.
   * A distributed vector that is stored in blocks.
   * @param data The underlying [[RDD]] which should consist of
   *             block indices and a breeze [[DenseVector]] containing
   *             all the elements in the said block.
-  */
+  * @param num_rows Number of elements as a [[Long]], in case not specified
+  *                 is calculated on instance creation.
+  * @param num_row_blocks Number of blocks, in case not specified
+  *                       is calculated on instance creation.
+  * @author mandar2812 date 13/10/2016.
+  *
+  * */
 private[dynaml] class PartitionedVector(data: Stream[(Long, DenseVector[Double])],
                                         num_rows: Long = -1L,
                                         num_row_blocks: Long = -1L)
   extends AbstractPartitionedVector[DenseVector[Double]](data, num_row_blocks)
   with NumericOps[PartitionedVector] {
 
+  self =>
+
   lazy val rows: Long = if(num_rows == -1L) data.map(_._2.length).sum.toLong else num_rows
 
   override lazy val cols: Long = 1L
@@ -36,6 +43,8 @@ private[dynaml] class PartitionedVector(data: Stream[(Long, DenseVector[Double])
 
   def toBreezeVector = DenseVector.vertcat(data.sortBy(_._1).map(_._2):_*)
 
+  def toStream = PartitionedVector.toStream(self)
+
   def reverse: PartitionedVector = map(c => (rowBlocks - 1L - c._1, DenseVector(c._2.toArray.reverse)))
 
 }
@@ -46,7 +55,7 @@ object PartitionedVector {
   /**
     * Create a [[PartitionedVector]] given the input blocks.
     *
-    */
+    * */
   def apply(data: Stream[(Long, DenseVector[Double])], num_rows: Long = -1L): PartitionedVector = {
 
     val nC = if(num_rows == -1L) data.map(_._2.length).sum else num_rows
@@ -58,7 +67,7 @@ object PartitionedVector {
   /**
     * Create a [[PartitionedVector]] from a tabulation function
     *
-    */
+    * */
   def apply(length: Long, numElementsPerBlock: Int, tabFunc: (Long) => Double): PartitionedVector = {
     val num_blocks: Long = math.ceil(length.toDouble/numElementsPerBlock).toLong
     val blockIndices = 0L until num_blocks
@@ -76,7 +85,7 @@ object PartitionedVector {
     * @param length The size of the stream
     * @param num_elements_per_block The size of each block
     * @return A [[PartitionedVector]] instance.
-    */
+    * */
   def apply(d: Stream[Double], length: Long, num_elements_per_block: Int): PartitionedVector = {
     val num_blocks: Long = math.ceil(length.toDouble/num_elements_per_block).toLong
     val data = d.grouped(num_elements_per_block)
@@ -92,7 +101,7 @@ object PartitionedVector {
     * @param v input vector
     * @param num_elements_per_block The size of each block
     * @return A [[PartitionedVector]] instance.
-    */
+    * */
   def apply(v: DenseVector[Double], num_elements_per_block: Int): PartitionedVector = {
     val blocks = v.toArray
       .grouped(num_elements_per_block)
@@ -106,7 +115,7 @@ object PartitionedVector {
 
   /**
     * Vertically merge a number of partitioned vectors.
-    */
+    * */
   def vertcat(vectors: PartitionedVector*): PartitionedVector = {
     //sanity check
     assert(vectors.map(_.colBlocks).distinct.length == 1,
@@ -121,22 +130,28 @@ object PartitionedVector {
 
   /**
     * Populate a partitioned vector with zeros.
-    */
+    * */
   def zeros(numElements: Long, numElementsPerBlock: Int): PartitionedVector =
     PartitionedVector(numElements, numElementsPerBlock, _ => 0.0)
 
   /**
     * Populate a partitioned vector with ones.
-    */
+    * */
   def ones(numElements: Long, numElementsPerBlock: Int): PartitionedVector =
     PartitionedVector(numElements, numElementsPerBlock, _ => 1.0)
 
   /**
     * Populate a partitioned vector with I.I.D samples from a
     * specified [[RandomVariable]]
-    */
+    * */
   def rand(numElements: Long, numElementsPerBlock: Int, r: RandomVariable[Double]): PartitionedVector =
-    PartitionedVector(numElements, numElementsPerBlock, _ => r.sample())
+    PartitionedVector(numElements, numElementsPerBlock, _ => r.draw)
+
+  /**
+    * Convert a [[PartitionedVector]] to a Scala [[Stream]].
+    * */
+  def toStream(pvec: PartitionedVector): Stream[Double] =
+    pvec._data.map(_._2.toArray.toStream).reduceLeft((a, b) => a ++ b)
 
 
 }
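A quick usage sketch (not part of the commit) of the new toStream conversion, together with the existing tabulation factory shown above. Since PartitionedVector is private[dynaml], the sketch assumes it runs from a hypothetical package under io.github.mandar2812.dynaml.

package io.github.mandar2812.dynaml.examples

import io.github.mandar2812.dynaml.algebra.PartitionedVector

object PartitionedVectorToStreamDemo extends App {

  // 1000 elements split into blocks of 100, filled by a tabulation function.
  val v: PartitionedVector = PartitionedVector(1000L, 100, i => math.sin(i.toDouble))

  // New in this commit: flatten the blocks back into an ordinary Scala Stream.
  val flat: Stream[Double] = v.toStream

  assert(flat.length == 1000)
}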

dynaml-core/src/main/scala-2.11/io/github/mandar2812/dynaml/models/StochasticProcessModel.scala

Lines changed: 22 additions & 10 deletions
@@ -20,19 +20,19 @@ package io.github.mandar2812.dynaml.models
 
 import io.github.mandar2812.dynaml.kernels.CovarianceFunction
 import io.github.mandar2812.dynaml.pipes.DataPipe
-import io.github.mandar2812.dynaml.probability.ContinuousRandomVariable
+import io.github.mandar2812.dynaml.probability.{ContinuousMixtureRV, ContinuousRandomVariable}
 import org.apache.log4j.Logger
 
 /**
-  * date 26/08/16.
   * High Level description of a stochastic process based predictive model.
   *
-  * @author mandar2812
   * @tparam T The underlying data structure storing the training & test data.
   * @tparam I The type of the index set (i.e. Double for time series, DenseVector for GP regression)
   * @tparam Y The type of the output label
   * @tparam W Implementing class of the posterior distribution
-  */
+  * @author mandar2812 date 26/08/16.
+  *
+  * */
 trait StochasticProcessModel[T, I, Y, W] extends Model[T, I, Y] {
 
   /** Calculates posterior predictive distribution for
@@ -62,15 +62,14 @@ trait StochasticProcessModel[T, I, Y, W] extends Model[T, I, Y] {
 }
 
 /**
-  * @author mandar2812
-  *
   * Processes which can be specified by upto second order statistics i.e. mean and covariance
   * @tparam T The underlying data structure storing the training & test data.
   * @tparam I The type of the index set (i.e. Double for time series, DenseVector for GP regression)
   * @tparam Y The type of the output label
   * @tparam K The type returned by the kernel function.
   * @tparam M The data structure holding the kernel/covariance matrix
   * @tparam W Implementing class of the posterior distribution
+  * @author mandar2812
   *
   * */
 trait SecondOrderProcessModel[T, I, Y, K, M, W] extends StochasticProcessModel[T, I, Y, W] {
@@ -92,13 +91,13 @@ trait SecondOrderProcessModel[T, I, Y, K, M, W] extends StochasticProcessModel[T
 }
 
 /**
-  * @author mandar2812 date: 11/10/2016
-  *
   * Blueprint for a continuous valued stochastic process, abstracts away the behavior
   * common to sub-classes such as [[io.github.mandar2812.dynaml.models.gp.GPRegression]],
   * [[io.github.mandar2812.dynaml.models.stp.StudentTRegression]] and others.
   *
-  */
+  * @author mandar2812 date: 11/10/2016
+  *
+  * */
 abstract class ContinuousProcessModel[T, I, Y, W <: ContinuousRandomVariable[_]]
   extends StochasticProcessModel[T, I, Y, W] {
 
@@ -138,4 +137,17 @@ abstract class ContinuousProcessModel[T, I, Y, W <: ContinuousRandomVariable[_]]
       i._2._4))
   }
 
-}
+}
+
+/**
+  * A process which is a multinomial mixture of
+  * component processes.
+  * @tparam I The type of the index set (i.e. Double for time series, DenseVector for GP regression)
+  * @tparam Y The type of the output label
+  * @tparam W Implementing class of the posterior distribution,
+  *           should inherit from [[ContinuousMixtureRV]]
+  * @author mandar2812 date 14/06/2017
+  * */
+abstract class StochasticProcessMixtureModel[
+T, I, Y, W <: ContinuousMixtureRV[_, _]] extends
+  ContinuousProcessModel[T, I, Y, W]

dynaml-core/src/main/scala-2.11/io/github/mandar2812/dynaml/models/gp/AbstractGPRegressionModel.scala

Lines changed: 5 additions & 4 deletions
@@ -74,6 +74,8 @@ abstract class AbstractGPRegressionModel[T, I: ClassTag](
 
   override protected val g: T = data
 
+  def _data = g
+
   val npoints = num
 
   protected var blockSize = 1000
@@ -309,13 +311,12 @@ abstract class AbstractGPRegressionModel[T, I: ClassTag](
     val varD: PartitionedVector = bdiag(postcov)
     val stdDev = varD._data.map(c => (c._1, sqrt(c._2))).map(_._2.toArray.toStream).reduceLeft((a, b) => a ++ b)*/
 
-    val mean = posterior.mu._data.map(_._2.toArray.toStream).reduceLeft((a, b) => a ++ b)
+    val mean = posterior.mu.toStream
 
     val (lower, upper) = posterior.underlyingDist.confidenceInterval(sigma.toDouble)
 
-    val lowerErrorBars = lower._data.map(_._2.toArray.toStream).reduceLeft((a, b) => a++b)
-    val upperErrorBars = upper._data.map(_._2.toArray.toStream).reduceLeft((a, b) => a++b)
-
+    val lowerErrorBars = lower.toStream
+    val upperErrorBars = upper.toStream
 
     logger.info("Generating error bars")
     //val preds = (mean zip stdDev).map(j => (j._1, j._1 - sigma*j._2, j._1 + sigma*j._2))
dynaml-core/src/main/scala-2.11/io/github/mandar2812/dynaml/models/gp/GaussianProcessMixture.scala (new file)

Lines changed: 102 additions & 0 deletions

@@ -0,0 +1,102 @@
+package io.github.mandar2812.dynaml.models.gp
+
+import breeze.linalg.DenseVector
+import io.github.mandar2812.dynaml.algebra.PartitionedVector
+import io.github.mandar2812.dynaml.models.StochasticProcessMixtureModel
+import io.github.mandar2812.dynaml.probability.{ContinuousDistrMixture, MultGaussianPRV}
+import io.github.mandar2812.dynaml.algebra.PartitionedMatrixOps._
+import org.apache.log4j.Logger
+
+import scala.reflect.ClassTag
+
+/**
+  *
+  * Represents a multinomial mixture of GP models
+  * @tparam I The index set (input domain) over which each component GP is
+  *           defined.
+  *
+  * @author mandar2812 date 14/06/2017.
+  * */
+class GaussianProcessMixture[I: ClassTag](
+  val component_processes: Seq[AbstractGPRegressionModel[_, I]],
+  val weights: DenseVector[Double]) extends
+  StochasticProcessMixtureModel[
+    Seq[(I, Double)], I, Double,
+    ContinuousDistrMixture[
+      PartitionedVector,
+      MultGaussianPRV]] {
+
+  private val logger = Logger.getLogger(this.getClass)
+
+  /**
+    *
+    * Calculates posterior predictive distribution for
+    * a particular set of test data points.
+    *
+    * @param test A Sequence or Sequence like data structure
+    *             storing the values of the input patters.
+    * */
+  override def predictiveDistribution[U <: Seq[I]](test: U) =
+    ContinuousDistrMixture[PartitionedVector, MultGaussianPRV](
+      component_processes.map(_.predictiveDistribution(test)),
+      weights)
+
+
+  /**
+    * The training data
+    * */
+  override protected val g: Seq[(I, Double)] = Seq()
+
+  /**
+    * Draw three predictions from the posterior predictive distribution
+    * 1) Mean or MAP estimate Y
+    * 2) Y- : The lower error bar estimate (mean - sigma*stdDeviation)
+    * 3) Y+ : The upper error bar. (mean + sigma*stdDeviation)
+    * */
+  override def predictionWithErrorBars[U <: Seq[I]](testData: U, sigma: Int) = {
+
+    val posterior_components = component_processes.map(_.predictiveDistribution(testData))
+
+    val post_means = posterior_components.map(_.mu)
+
+    //._data.map(_._2.toArray.toStream).reduceLeft((a, b) => a ++ b)
+
+    val error_bars_components = posterior_components.map(
+      _.underlyingDist.confidenceInterval(sigma.toDouble)
+    )
+
+    val weightsArr = weights.toArray
+
+    val mean = post_means.zip(weightsArr).map(c => c._1*c._2).reduce((a, b) => a+b).toStream
+
+    val combined_error_bars_vec = error_bars_components.zip(weightsArr)
+      .map(c => (c._1._1*c._2, c._1._2*c._2))
+      .reduce((a,b) => (a._1+b._1, a._2+b._2))
+
+    val (lowerErrorBars, upperErrorBars) = (
+      combined_error_bars_vec._1.toStream,
+      combined_error_bars_vec._2.toStream)
+
+
+    logger.info("Generating error bars")
+    //val preds = (mean zip stdDev).map(j => (j._1, j._1 - sigma*j._2, j._1 + sigma*j._2))
+    val preds = mean.zip(lowerErrorBars.zip(upperErrorBars)).map(t => (t._1, t._2._1, t._2._2))
+    (testData zip preds).map(i => (i._1, i._2._1, i._2._2, i._2._3))
+  }
+
+
+  /**
+    * Convert from the underlying data structure to
+    * Seq[(I, Y)] where I is the index set of the GP
+    * and Y is the value/label type.
+    * */
+  override def dataAsSeq(data: Seq[(I, Double)]) = data
+
+  /**
+    * Predict the value of the
+    * target variable given a
+    * point.
+    *
+    * */
+  override def predict(point: I) = predictionWithErrorBars(Seq(point), 1).head._2
+}
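A usage sketch (not part of the commit) of the new GaussianProcessMixture class. The component models gp1 and gp2 and the test inputs are hypothetical placeholders for GP regression models trained elsewhere; only GaussianProcessMixture, AbstractGPRegressionModel and the argument types come from this diff.

import breeze.linalg.DenseVector
import io.github.mandar2812.dynaml.models.gp.{AbstractGPRegressionModel, GaussianProcessMixture}

// Hypothetical, pre-trained component GPs over a DenseVector[Double] index set.
val gp1: AbstractGPRegressionModel[_, DenseVector[Double]] = ???
val gp2: AbstractGPRegressionModel[_, DenseVector[Double]] = ???
val testPoints: Seq[DenseVector[Double]] = ???

// Two component mixture with multinomial weights.
val gpMixture = new GaussianProcessMixture[DenseVector[Double]](
  Seq(gp1, gp2),
  DenseVector(0.6, 0.4))

// Posterior predictive distribution: a ContinuousDistrMixture over the component posteriors.
val posterior = gpMixture.predictiveDistribution(testPoints)

// (input, mean, lower, upper) tuples; means and error bars are the weighted sums
// of the component quantities, as implemented in predictionWithErrorBars above.
val predictions = gpMixture.predictionWithErrorBars(testPoints, 1)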

dynaml-core/src/main/scala-2.11/io/github/mandar2812/dynaml/models/stp/AbstractSTPRegressionModel.scala

Lines changed: 3 additions & 3 deletions
@@ -223,12 +223,12 @@ abstract class AbstractSTPRegressionModel[T, I](
     val varD: PartitionedVector = bdiag(postcov)
     val stdDev = varD._data.map(c => (c._1, sqrt(c._2))).map(_._2.toArray.toStream).reduceLeft((a, b) => a ++ b)*/
 
-    val mean = posterior.mean._data.map(_._2.toArray.toStream).reduceLeft((a, b) => a ++ b)
+    val mean = posterior.mean.toStream
 
     val (lower, upper) = posterior.underlyingDist.confidenceInterval(sigma.toDouble)
 
-    val lowerErrorBars = lower._data.map(_._2.toArray.toStream).reduceLeft((a, b) => a++b)
-    val upperErrorBars = upper._data.map(_._2.toArray.toStream).reduceLeft((a, b) => a++b)
+    val lowerErrorBars = lower.toStream
+    val upperErrorBars = upper.toStream
 
 
     logger.info("Generating error bars")

dynaml-core/src/main/scala-2.11/io/github/mandar2812/dynaml/probability/MixtureRV.scala

Lines changed: 17 additions & 2 deletions
@@ -36,18 +36,26 @@ object MixtureRV {
 
 }
 
+/**
+  * A random variable mixture over a continuous domain
+  * @author mandar2812 date 14/06/2017
+  * */
 trait ContinuousMixtureRV[Domain, BaseRV <: ContinuousRandomVariable[Domain]] extends
   ContinuousRandomVariable[Domain] with
   MixtureRV[Domain, BaseRV]
 
-
+/**
+  * A random variable mixture over a continuous domain,
+  * having a computable probability distribution
+  * @author mandar2812 date 14/06/2017
+  * */
 class ContinuousDistrMixture[
 Domain, BaseRV <: ContinuousDistrRV[Domain]](
   distributions: Seq[BaseRV],
   selector: MultinomialRV) extends
   ContinuousMixtureRV[Domain, BaseRV] with
   ContinuousDistrRV[Domain] {
-
+
   override val mixture_selector = selector
 
   override val components = distributions
@@ -57,3 +65,10 @@ Domain, BaseRV <: ContinuousDistrRV[Domain]](
     selector.underlyingDist.params)
 }
 
+object ContinuousDistrMixture {
+
+  def apply[Domain, BaseRV <: ContinuousDistrRV[Domain]](
+    distributions: Seq[BaseRV],
+    selector: DenseVector[Double]): ContinuousDistrMixture[Domain, BaseRV] =
+    new ContinuousDistrMixture(distributions, MultinomialRV(selector))
+}
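A short sketch (not part of the commit) of the new ContinuousDistrMixture.apply: a two component mixture of scalar Gaussians. It assumes GaussianRV from io.github.mandar2812.dynaml.probability is available as a ContinuousDistrRV[Double]; the weights vector is wrapped into the MultinomialRV selector by the companion object.

import breeze.linalg.DenseVector
import io.github.mandar2812.dynaml.probability.{ContinuousDistrMixture, GaussianRV}

val mixture = ContinuousDistrMixture[Double, GaussianRV](
  Seq(GaussianRV(-1.0, 0.5), GaussianRV(2.0, 1.0)),
  DenseVector(0.3, 0.7))

// Sampling first picks a component via the multinomial selector,
// then draws from the chosen Gaussian.
val x: Double = mixture.draw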
