Skip to content

Commit 8521f6b

Browse files
committed
CNAM-154 Added support for multiple bucket sizes and lag counts
CNAM-154 Added parameter for including/excluding the death bucket
1 parent 5ed7fb1 commit 8521f6b

File tree

7 files changed

+170
-28
lines changed

7 files changed

+170
-28
lines changed

src/main/resources/config/filtering-default.conf

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,11 @@ default = {
4141

4242
}
4343
mlpp_parameters = {
44-
bucket_size = 30 # in days
45-
lag_count = 10
44+
bucket_size = [30] # in days
45+
lag_count = [10]
4646
min_timestamp = ${default.dates.study_start}
4747
max_timestamp = ${default.dates.study_end}
48+
include_death_bucket = false
4849

4950
exposures = {
5051
min_purchases = 1

src/main/scala/fr/polytechnique/cmap/cnam/filtering/mlpp/MLPPConfig.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,11 @@ object MLPPConfig {
2222

2323
private lazy val conf: Config = FilteringConfig.modelConfig("mlpp_parameters")
2424

25-
lazy val bucketSize: Int = conf.getInt("bucket_size")
26-
lazy val lagCount: Int = conf.getInt("lag_count")
25+
lazy val bucketSizes: List[Int] = conf.getIntList("bucket_size").asScala.toList.map(_.toInt)
26+
lazy val lagCounts: List[Int] = conf.getIntList("lag_count").asScala.toList.map(_.toInt)
2727
lazy val minTimestamp: Timestamp = makeTS(conf.getIntList("min_timestamp").asScala.toList)
2828
lazy val maxTimestamp: Timestamp = makeTS(conf.getIntList("max_timestamp").asScala.toList)
29+
lazy val includeDeathBucket: Boolean = conf.getBoolean("include_death_bucket")
2930

3031
lazy val exposureDefinition = MLPPExposureDefinition(
3132
minPurchases = conf.getInt("exposures.min_purchases"),

src/main/scala/fr/polytechnique/cmap/cnam/filtering/mlpp/MLPPMain.scala

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ object MLPPMain extends Main {
2828
val patients: Dataset[Patient] = flatEvents.map(
2929
e => Patient(e.patientID, e.gender, e.birthDate, e.deathDate)
3030
).distinct
31+
// todo: test if filter_lost_patients is true
3132
val tracklossEvents: Dataset[Event] = TrackLossTransformer.transform(
3233
Sources(dcir=Some(dcirFlat))
3334
)
@@ -41,15 +42,21 @@ object MLPPMain extends Main {
4142

4243
val exposures: Dataset[FlatEvent] = MLPPExposuresTransformer.transform(allEvents)
4344

44-
val mlppParams = MLPPWriter.Params(
45-
bucketSize = MLPPConfig.bucketSize,
46-
lagCount = MLPPConfig.lagCount,
47-
minTimestamp = MLPPConfig.minTimestamp,
48-
maxTimestamp = MLPPConfig.maxTimestamp
49-
)
50-
val mlppWriter = MLPPWriter(mlppParams)
51-
val result = MLPPWriter(mlppParams).write(diseaseEvents.union(exposures), outputPath)
52-
53-
Some(result)
45+
val results: List[Dataset[MLPPFeature]] = for {
46+
bucketSize <- MLPPConfig.bucketSizes
47+
lagCount <- MLPPConfig.lagCounts
48+
} yield {
49+
val mlppParams = MLPPWriter.Params(
50+
bucketSize = bucketSize,
51+
lagCount = lagCount,
52+
minTimestamp = MLPPConfig.minTimestamp,
53+
maxTimestamp = MLPPConfig.maxTimestamp,
54+
includeDeathBucket = MLPPConfig.includeDeathBucket
55+
)
56+
val mlppWriter = MLPPWriter(mlppParams)
57+
val path = s"$outputPath/${bucketSize}B-${lagCount}L"
58+
MLPPWriter(mlppParams).write(diseaseEvents.union(exposures), path)
59+
}
60+
Some(results.head)
5461
}
5562
}

src/main/scala/fr/polytechnique/cmap/cnam/filtering/mlpp/MLPPWriter.scala

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ object MLPPWriter {
1717
bucketSize: Int = 30,
1818
lagCount: Int = 10,
1919
minTimestamp: Timestamp = makeTS(2006, 1, 1),
20-
maxTimestamp: Timestamp = makeTS(2009, 12, 31, 23, 59, 59)
20+
maxTimestamp: Timestamp = makeTS(2009, 12, 31, 23, 59, 59),
21+
includeDeathBucket: Boolean = false
2122
)
2223

2324
def apply(params: Params = Params()) = new MLPPWriter(params)
@@ -76,9 +77,9 @@ class MLPPWriter(params: MLPPWriter.Params = MLPPWriter.Params()) {
7677
// We are no longer using trackloss and disease information for calculating the end bucket.
7778
def withEndBucket: DataFrame = {
7879

79-
val endBucket: Column = minColumn(
80-
col("deathBucket"), lit(bucketCount)
81-
)
80+
val deathBucketRule = if (params.includeDeathBucket) col("deathBucket") + 1 else col("deathBucket")
81+
82+
val endBucket: Column = minColumn(deathBucketRule, lit(bucketCount))
8283
data.withColumn("endBucket", endBucket)
8384
}
8485

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# This is needed because otherwise in the current dummy data all patients would be filtered
22
mlpp_parameters.exposures.filter_diagnosed_patients = false
3-
mlpp_parameters.bucket_size = 20 # days
3+
mlpp_parameters.bucket_size = [20] # days

src/test/resources/config/mlpp-new-exposure.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# This is needed because otherwise in the current dummy data all patients would be filtered
22
mlpp_parameters.exposures.filter_diagnosed_patients = false
3-
mlpp_parameters.bucket_size = 20 # days
3+
mlpp_parameters.bucket_size = [20] # days
44

55
# Changing exposure definition to a "cox-like" one.
66
mlpp_parameters.exposures = {

src/test/scala/fr/polytechnique/cmap/cnam/filtering/mlpp/MLPPWriterSuite.scala

Lines changed: 140 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ class MLPPWriterSuite extends SharedContext {
193193
assert(result === expected)
194194
}
195195

196-
"withEndBucket" should "add a column with the minimum among deathBucket, diseaseBucket and the max number of buckets" in {
196+
"withEndBucket" should "add a column with the minimum among deathBucket and the max number of buckets" in {
197197
val sqlCtx = sqlContext
198198
import sqlCtx.implicits._
199199

@@ -244,6 +244,53 @@ class MLPPWriterSuite extends SharedContext {
244244
assert(result === expected)
245245
}
246246

247+
it should "add a column with the minimum among deathBucket + 1, and the max number of buckets if " +
248+
"includeDeathBucket is true" in {
249+
val sqlCtx = sqlContext
250+
import sqlCtx.implicits._
251+
252+
// Given
253+
val params = MLPPWriter.Params(
254+
minTimestamp = makeTS(2006, 1, 1),
255+
maxTimestamp = makeTS(2006, 2, 2),
256+
bucketSize = 2,
257+
includeDeathBucket = true
258+
)
259+
260+
val input = Seq(
261+
("PA", Some(16)),
262+
("PA", Some(16)),
263+
("PB", Some( 0)),
264+
("PB", Some( 0)),
265+
("PC", Some( 5)),
266+
("PC", Some( 5)),
267+
("PD", None),
268+
("PD", None)
269+
).toDF("patientID", "deathBucket")
270+
271+
val expected = Seq(
272+
("PA", Some(16)),
273+
("PA", Some(16)),
274+
("PB", Some( 1)),
275+
("PB", Some( 1)),
276+
("PC", Some( 6)),
277+
("PC", Some( 6)),
278+
("PD", Some(16)),
279+
("PD", Some(16))
280+
).toDF("patientID", "endBucket")
281+
282+
// When
283+
val writer = MLPPWriter(params)
284+
import writer.MLPPDataFrame
285+
val result = input.withEndBucket.select("patientID", "endBucket")
286+
287+
// Then
288+
import RichDataFrames._
289+
result.show
290+
expected.show
291+
assert(result === expected)
292+
}
293+
247294
"makeDiscreteExposures" should "return a Dataset containing the 0-lag exposures in the sparse format" in {
248295
val sqlCtx = sqlContext
249296
import sqlCtx.implicits._
@@ -592,9 +639,9 @@ class MLPPWriterSuite extends SharedContext {
592639
)
593640
val input: Dataset[FlatEvent] = Seq(
594641
FlatEvent("PC", 2, makeTS(1970, 1, 1), None, "exposure", "Mol1", 1.0, makeTS(2006, 5, 15), None),
595-
FlatEvent("PB", 1, makeTS(1950, 1, 1), Some(makeTS(2006, 6, 15)), "exposure", "Mol1", 1.0, makeTS(2006, 1, 15), None),
596-
FlatEvent("PB", 1, makeTS(1950, 1, 1), Some(makeTS(2006, 6, 15)), "exposure", "Mol2", 1.0, makeTS(2006, 3, 15), None),
597-
FlatEvent("PB", 1, makeTS(1950, 1, 1), Some(makeTS(2006, 6, 15)), "exposure", "Mol2", 1.0, makeTS(2006, 5, 15), None),
642+
FlatEvent("PB", 1, makeTS(1950, 1, 1), Some(makeTS(2006, 4, 15)), "exposure", "Mol1", 1.0, makeTS(2006, 1, 15), None),
643+
FlatEvent("PB", 1, makeTS(1950, 1, 1), Some(makeTS(2006, 4, 15)), "exposure", "Mol1", 1.0, makeTS(2006, 3, 15), None),
644+
FlatEvent("PB", 1, makeTS(1950, 1, 1), Some(makeTS(2006, 4, 15)), "disease", "targetDisease", 1.0, makeTS(2006, 3, 15), None),
598645
FlatEvent("PA", 1, makeTS(1960, 1, 1), None, "exposure", "Mol1", 1.0, makeTS(2006, 1, 15), None),
599646
FlatEvent("PA", 1, makeTS(1960, 1, 1), None, "exposure", "Mol1", 1.0, makeTS(2006, 3, 15), None),
600647
FlatEvent("PA", 1, makeTS(1960, 1, 1), None, "exposure", "Mol1", 1.0, makeTS(2006, 4, 15), None),
@@ -624,12 +671,17 @@ class MLPPWriterSuite extends SharedContext {
624671
MLPPFeature("PA", 0, "Mol3", 2, 3, 0, 3, 8, 1.0),
625672
MLPPFeature("PA", 0, "Mol3", 2, 4, 1, 4, 9, 1.0),
626673
MLPPFeature("PA", 0, "Mol3", 2, 5, 2, 5, 10, 1.0),
627-
MLPPFeature("PA", 0, "Mol3", 2, 6, 3, 6, 11, 1.0)
674+
MLPPFeature("PA", 0, "Mol3", 2, 6, 3, 6, 11, 1.0),
675+
// Patient B
676+
MLPPFeature("PB", 1, "Mol1", 0, 0, 0, 7, 0, 1.0),
677+
MLPPFeature("PB", 1, "Mol1", 0, 1, 1, 8, 1, 1.0),
678+
MLPPFeature("PB", 1, "Mol1", 0, 2, 2, 9, 2, 1.0),
679+
MLPPFeature("PB", 1, "Mol1", 0, 2, 0, 9, 0, 1.0)
628680
).toDF
629681

630682
val expectedZMatrix = Seq(
631683
(3D, 1D, 1D, 46, 1, "PA", 0),
632-
(1D, 2D, 0D, 56, 1, "PB", 1),
684+
(2D, 0D, 0D, 56, 1, "PB", 1),
633685
(1D, 0D, 0D, 36, 2, "PC", 2)
634686
).toDF("MOL0000_Mol1", "MOL0001_Mol2", "MOL0002_Mol3", "age", "gender", "patientID", "patientIDIndex")
635687

@@ -640,8 +692,88 @@ class MLPPWriterSuite extends SharedContext {
640692

641693
// Then
642694
import RichDataFrames._
643-
result.show
644-
expectedFeatures.show
695+
result.show(100)
696+
expectedFeatures.show(100)
697+
StaticExposures.show
698+
expectedZMatrix.show
699+
assert(result === expectedFeatures)
700+
assert(writtenResult === expectedFeatures)
701+
assert(StaticExposures === expectedZMatrix)
702+
}
703+
704+
705+
it should "create the final matrices and write them as parquet files (removing death bucket)" in {
706+
val sqlCtx = sqlContext
707+
import sqlCtx.implicits._
708+
709+
// Given
710+
val rootDir = "target/test/output"
711+
val params = MLPPWriter.Params(
712+
minTimestamp = makeTS(2006, 1, 1),
713+
maxTimestamp = makeTS(2006, 8, 1), // 7 total buckets
714+
bucketSize = 30,
715+
lagCount = 4,
716+
includeDeathBucket = true
717+
)
718+
val input: Dataset[FlatEvent] = Seq(
719+
FlatEvent("PC", 2, makeTS(1970, 1, 1), None, "exposure", "Mol1", 1.0, makeTS(2006, 5, 15), None),
720+
FlatEvent("PB", 1, makeTS(1950, 1, 1), Some(makeTS(2006, 4, 15)), "exposure", "Mol1", 1.0, makeTS(2006, 1, 15), None),
721+
FlatEvent("PB", 1, makeTS(1950, 1, 1), Some(makeTS(2006, 4, 15)), "exposure", "Mol1", 1.0, makeTS(2006, 3, 15), None),
722+
FlatEvent("PB", 1, makeTS(1950, 1, 1), Some(makeTS(2006, 4, 15)), "disease", "targetDisease", 1.0, makeTS(2006, 3, 15), None),
723+
FlatEvent("PA", 1, makeTS(1960, 1, 1), None, "exposure", "Mol1", 1.0, makeTS(2006, 1, 15), None),
724+
FlatEvent("PA", 1, makeTS(1960, 1, 1), None, "exposure", "Mol1", 1.0, makeTS(2006, 3, 15), None),
725+
FlatEvent("PA", 1, makeTS(1960, 1, 1), None, "exposure", "Mol1", 1.0, makeTS(2006, 4, 15), None),
726+
FlatEvent("PA", 1, makeTS(1960, 1, 1), None, "exposure", "Mol2", 1.0, makeTS(2006, 3, 15), None),
727+
FlatEvent("PA", 1, makeTS(1960, 1, 1), None, "exposure", "Mol3", 1.0, makeTS(2006, 4, 15), None),
728+
FlatEvent("PA", 1, makeTS(1960, 1, 1), None, "disease", "targetDisease", 1.0, makeTS(2006, 5, 15), None)
729+
).toDS
730+
731+
val expectedFeatures = Seq(
732+
// Patient A
733+
MLPPFeature("PA", 0, "Mol1", 0, 0, 0, 0, 0, 1.0),
734+
MLPPFeature("PA", 0, "Mol1", 0, 1, 1, 1, 1, 1.0),
735+
MLPPFeature("PA", 0, "Mol1", 0, 2, 2, 2, 2, 1.0),
736+
MLPPFeature("PA", 0, "Mol1", 0, 3, 3, 3, 3, 1.0),
737+
MLPPFeature("PA", 0, "Mol1", 0, 2, 0, 2, 0, 1.0),
738+
MLPPFeature("PA", 0, "Mol1", 0, 3, 1, 3, 1, 1.0),
739+
MLPPFeature("PA", 0, "Mol1", 0, 4, 2, 4, 2, 1.0),
740+
MLPPFeature("PA", 0, "Mol1", 0, 5, 3, 5, 3, 1.0),
741+
MLPPFeature("PA", 0, "Mol1", 0, 3, 0, 3, 0, 1.0),
742+
MLPPFeature("PA", 0, "Mol1", 0, 4, 1, 4, 1, 1.0),
743+
MLPPFeature("PA", 0, "Mol1", 0, 5, 2, 5, 2, 1.0),
744+
MLPPFeature("PA", 0, "Mol1", 0, 6, 3, 6, 3, 1.0),
745+
MLPPFeature("PA", 0, "Mol2", 1, 2, 0, 2, 4, 1.0),
746+
MLPPFeature("PA", 0, "Mol2", 1, 3, 1, 3, 5, 1.0),
747+
MLPPFeature("PA", 0, "Mol2", 1, 4, 2, 4, 6, 1.0),
748+
MLPPFeature("PA", 0, "Mol2", 1, 5, 3, 5, 7, 1.0),
749+
MLPPFeature("PA", 0, "Mol3", 2, 3, 0, 3, 8, 1.0),
750+
MLPPFeature("PA", 0, "Mol3", 2, 4, 1, 4, 9, 1.0),
751+
MLPPFeature("PA", 0, "Mol3", 2, 5, 2, 5, 10, 1.0),
752+
MLPPFeature("PA", 0, "Mol3", 2, 6, 3, 6, 11, 1.0),
753+
// Patient B
754+
MLPPFeature("PB", 1, "Mol1", 0, 0, 0, 7, 0, 1.0),
755+
MLPPFeature("PB", 1, "Mol1", 0, 1, 1, 8, 1, 1.0),
756+
MLPPFeature("PB", 1, "Mol1", 0, 2, 2, 9, 2, 1.0),
757+
MLPPFeature("PB", 1, "Mol1", 0, 3, 3, 10, 3, 1.0),
758+
MLPPFeature("PB", 1, "Mol1", 0, 2, 0, 9, 0, 1.0),
759+
MLPPFeature("PB", 1, "Mol1", 0, 3, 1, 10, 1, 1.0)
760+
).toDF
761+
762+
val expectedZMatrix = Seq(
763+
(3D, 1D, 1D, 46, 1, "PA", 0),
764+
(2D, 0D, 0D, 56, 1, "PB", 1),
765+
(1D, 0D, 0D, 36, 2, "PC", 2)
766+
).toDF("MOL0000_Mol1", "MOL0001_Mol2", "MOL0002_Mol3", "age", "gender", "patientID", "patientIDIndex")
767+
768+
// When
769+
val result = MLPPWriter(params).write(input, rootDir).toDF
770+
val writtenResult = sqlContext.read.parquet(s"$rootDir/parquet/SparseFeatures")
771+
val StaticExposures = sqlContext.read.parquet(s"$rootDir/parquet/StaticExposures")
772+
773+
// Then
774+
import RichDataFrames._
775+
result.show(100)
776+
expectedFeatures.show(100)
645777
StaticExposures.show
646778
expectedZMatrix.show
647779
assert(result === expectedFeatures)

0 commit comments

Comments
 (0)