Skip to content

Commit b7469a1

Browse files
committed
CNAM-143 Fixes for running at CNAM
1 parent a2b15be commit b7469a1

File tree

5 files changed

+121
-26
lines changed

5 files changed

+121
-26
lines changed
src/main/scala/fr/polytechnique/cmap/cnam/filtering/mlpp/MLPPProvisoryMain.scala

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
package fr.polytechnique.cmap.cnam.filtering.mlpp

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._
import com.typesafe.config.{Config, ConfigFactory}
import fr.polytechnique.cmap.cnam.Main
import fr.polytechnique.cmap.cnam.filtering.FlatEvent

/**
 * Provisional entry point used to run the MLPP featuring at CNAM on 09/11/2016.
 *
 * Reads the already-filtered Cox cohorts ("broad" and "narrow") from the shared
 * file system, restricts the flat events to the Cox population, and writes the
 * MLPP feature matrices for each cohort.
 */
object MLPPProvisoryMain extends Main {

  override def appName: String = "MLPPMain"

  /**
   * Runs the MLPP featuring for both the "broad" and "narrow" cohorts.
   *
   * @param sqlContext the Spark SQL context.
   * @param config     currently unused; kept for interface stability.
   *                   TODO: read the input/output roots from this config.
   * @param inputRoot  root directory of the filtered data (default: CNAM shared path).
   * @param outputRoot root directory where MLPP features are written.
   */
  def runMLPPFeaturing(
      sqlContext: SQLContext,
      config: Config,
      inputRoot: String = "/shared/burq/filtered_data",
      outputRoot: String = "/shared/mlpp_features"): Unit = {
    import sqlContext.implicits._

    Seq("broad", "narrow").foreach { cohort =>
      // Patients retained by the Cox featuring: used to restrict the MLPP input
      // to the same population.
      val coxPatients = sqlContext.read
        .parquet(s"$inputRoot/$cohort/cox")
        .select("patientID")
        .distinct

      // Keep only the event categories MLPP consumes; "molecule" events are
      // relabeled as "exposure" (the category MLPPWriter expects).
      val flatEventsDF = sqlContext.read.parquet(s"$inputRoot/$cohort/events")
        .where(col("category").isin("trackloss", "disease", "molecule"))
        .join(coxPatients, "patientID")
        .withColumn("category",
          when(col("category") === "molecule", lit("exposure")).otherwise(col("category")))

      val flatEvents = flatEventsDF.as[FlatEvent].persist

      MLPPWriter().write(flatEvents, s"$outputRoot/$cohort/")

      // Release the cached dataset before processing the next cohort,
      // otherwise both cohorts stay pinned in executor memory.
      flatEvents.unpersist()
    }
  }

  override def main(args: Array[String]): Unit = {
    startContext()
    // Default to the "test" environment when no argument is supplied.
    val environment = if (args.nonEmpty) args(0) else "test"
    val config: Config = ConfigFactory.parseResources("filtering.conf").getConfig(environment)
    runMLPPFeaturing(sqlContext, config)
    stopContext()
  }
}

src/main/scala/fr/polytechnique/cmap/cnam/filtering/mlpp/MLPPWriter.scala

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,23 @@ class MLPPWriter(params: MLPPWriter.Params = MLPPWriter.Params()) {
5050
)
5151
}
5252

53+
/** Adds a "tracklossBucket" column: the time bucket of the patient's first
  * trackloss event, or null when the patient has no qualifying trackloss.
  */
def withTracklossBucket: DataFrame = {
  val byPatient = Window.partitionBy("patientId")

  // A trackloss only qualifies when it happens strictly before the patient's
  // death bucket and before the end of the study period (bucketCount).
  val isQualifyingTrackloss: Column =
    (col("category") === "trackloss") &&
      (col("startBucket") < minColumn(col("deathBucket"), lit(bucketCount)))

  // Earliest qualifying trackloss bucket, broadcast to every row of the patient.
  val firstTracklossBucket: Column =
    min(when(isQualifyingTrackloss, col("startBucket"))).over(byPatient)

  data.withColumn("tracklossBucket", firstTracklossBucket)
}
63+
5364
def withDiseaseBucket: DataFrame = {
5465
val window = Window.partitionBy("patientId")
5566

5667
val hadDisease: Column = (col("category") === "disease") &&
57-
(col("eventId") === "targetDisease") &&
58-
(col("startBucket") < minColumn(col("deathBucket"), lit(bucketCount)))
68+
(col("eventId") === "targetDisease") &&
69+
(col("startBucket") < minColumn(col("tracklossBucket"), col("deathBucket"), lit(bucketCount)))
5970

6071
val diseaseBucket: Column = min(when(hadDisease, col("startBucket"))).over(window)
6172

@@ -65,7 +76,7 @@ class MLPPWriter(params: MLPPWriter.Params = MLPPWriter.Params()) {
6576
/** Adds an "endBucket" column: the bucket where the patient's follow-up ends,
  * i.e. the earliest of trackloss, disease, death, or the end of the study.
  */
def withEndBucket: DataFrame = {
  // Follow-up stops at the first censoring/outcome event, capped at bucketCount.
  val followUpEnd: Column = minColumn(
    col("tracklossBucket"), col("diseaseBucket"), col("deathBucket"), lit(bucketCount)
  )
  data.withColumn("endBucket", followUpEnd)
}
@@ -257,6 +268,7 @@ class MLPPWriter(params: MLPPWriter.Params = MLPPWriter.Params()) {
257268
.withAge(AgeReferenceDate)
258269
.withStartBucket
259270
.withDeathBucket
271+
.withTracklossBucket
260272
.withDiseaseBucket
261273
.withEndBucket
262274
.where(col("category") === "exposure")

src/main/scala/fr/polytechnique/cmap/cnam/utilities/ColumnUtilities.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ object ColumnUtilities {
4141
val lastBucket = if (bucketCount > 0) bucketCount - 1 else 0
4242

4343
val bucketId: Column = floor(datediff(column, lit(minTimestamp)) / lengthDays).cast(IntegerType)
44-
when(bucketId <= lastBucket || bucketId.isNull, bucketId)
45-
.otherwise(lastBucket)
44+
when(bucketId.isNull || bucketId.between(0, lastBucket), bucketId)
45+
//.otherwise(lastBucket)
4646
}
4747
}
4848
}

src/test/scala/fr/polytechnique/cmap/cnam/filtering/mlpp/MLPPWriterSuite.scala

Lines changed: 65 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -111,34 +111,75 @@ class MLPPWriterSuite extends SharedContext {
111111
assert(result === expected)
112112
}
113113

114-
"withDiseaseBucket" should "add a column with the timeBucket of the first targetDisease of each patient" in {
114+
// Verifies that withTracklossBucket tags every row of a patient with the bucket
// of that patient's first qualifying trackloss (strictly before death), and
// leaves it null when no trackloss qualifies (PC: no trackloss; PD: trackloss
// at bucket 4 is not before death at bucket 3).
"withTracklossBucket" should "add a column with the timeBucket of the first trackloss of each patient" in {
  val sqlCtx = sqlContext
  import sqlCtx.implicits._

  // Given
  val input = Seq(
    ("PA", "molecule", "PIOGLITAZONE", 0, Some(4)),
    ("PA", "molecule", "PIOGLITAZONE", 5, Some(4)),
    ("PA", "trackloss", "trackloss", 3, Some(4)),
    ("PB", "molecule", "PIOGLITAZONE", 2, None),
    ("PB", "trackloss", "trackloss", 4, None),
    ("PC", "molecule", "PIOGLITAZONE", 0, Some(6)),
    ("PD", "molecule", "PIOGLITAZONE", 2, Some(3)),
    ("PD", "molecule", "PIOGLITAZONE", 3, Some(3)),
    ("PD", "trackloss", "trackloss", 4, Some(3))
  ).toDF("patientID", "category", "eventId", "startBucket", "deathBucket")

  val expected = Seq(
    ("PA", "molecule", "PIOGLITAZONE", 0, Some(4), Some(3)),
    ("PA", "molecule", "PIOGLITAZONE", 5, Some(4), Some(3)),
    ("PA", "trackloss", "trackloss", 3, Some(4), Some(3)),
    ("PB", "molecule", "PIOGLITAZONE", 2, None, Some(4)),
    ("PB", "trackloss", "trackloss", 4, None, Some(4)),
    ("PC", "molecule", "PIOGLITAZONE", 0, Some(6), None),
    ("PD", "molecule", "PIOGLITAZONE", 2, Some(3), None),
    ("PD", "molecule", "PIOGLITAZONE", 3, Some(3), None),
    ("PD", "trackloss", "trackloss", 4, Some(3), None)
  ).toDF("patientID", "category", "eventId", "startBucket", "deathBucket", "tracklossBucket")

  // When
  val writer = MLPPWriter()
  import writer.MLPPDataFrame
  val result = input.withTracklossBucket

  // Then
  // NOTE(review): removed the result.show/expected.show debug calls that were
  // left in — they only pollute the test output.
  import RichDataFrames._
  assert(result === expected)
}
154+
155+
"withDiseaseBucket" should "add a column with the timeBucket of the first targetDisease of each patient" in {
156+
val sqlCtx = sqlContext
157+
import sqlCtx.implicits._
158+
159+
// Given
160+
val input = Seq(
161+
("PA", "molecule", "PIOGLITAZONE", 0, Some(4), None),
162+
("PA", "molecule", "PIOGLITAZONE", 5, Some(4), None),
163+
("PA", "disease", "targetDisease", 3, Some(4), None),
164+
("PB", "molecule", "PIOGLITAZONE", 2, None, Some(5)),
165+
("PB", "disease", "targetDisease", 4, None, Some(5)),
137166
("PC", "molecule", "PIOGLITAZONE", 0, Some(6), None),
138167
("PD", "molecule", "PIOGLITAZONE", 2, Some(3), None),
139168
("PD", "molecule", "PIOGLITAZONE", 3, Some(3), None),
140169
("PD", "disease", "targetDisease", 4, Some(3), None)
141-
).toDF("patientID", "category", "eventId", "startBucket", "deathBucket", "diseaseBucket")
170+
).toDF("patientID", "category", "eventId", "startBucket", "deathBucket", "tracklossBucket")
171+
172+
val expected = Seq(
173+
("PA", "molecule", "PIOGLITAZONE", 0, Some(4), None, Some(3)),
174+
("PA", "molecule", "PIOGLITAZONE", 5, Some(4), None, Some(3)),
175+
("PA", "disease", "targetDisease", 3, Some(4), None, Some(3)),
176+
("PB", "molecule", "PIOGLITAZONE", 2, None, Some(5), Some(4)),
177+
("PB", "disease", "targetDisease", 4, None, Some(5), Some(4)),
178+
("PC", "molecule", "PIOGLITAZONE", 0, Some(6), None, None),
179+
("PD", "molecule", "PIOGLITAZONE", 2, Some(3), None, None),
180+
("PD", "molecule", "PIOGLITAZONE", 3, Some(3), None, None),
181+
("PD", "disease", "targetDisease", 4, Some(3), None, None)
182+
).toDF("patientID", "category", "eventId", "startBucket", "deathBucket", "tracklossBucket", "diseaseBucket")
142183

143184
// When
144185
val writer = MLPPWriter()
@@ -164,16 +205,18 @@ class MLPPWriterSuite extends SharedContext {
164205
)
165206

166207
val input = Seq(
167-
("PA", Some(2), Some(3)),
168-
("PA", Some(2), Some(3)),
169-
("PB", Some(4), Some(3)),
170-
("PB", Some(4), Some(3)),
171-
("PC", None, Some(4)),
172-
("PC", None, Some(4)),
173-
("PD", Some(4), None),
174-
("PD", Some(4), None),
175-
("PE", None, None)
176-
).toDF("patientID", "deathBucket", "diseaseBucket")
208+
("PA", Some(2), None, Some(3)),
209+
("PA", Some(2), None, Some(3)),
210+
("PB", Some(4), Some(5), Some(3)),
211+
("PB", Some(4), Some(5), Some(3)),
212+
("PC", None, Some(5), Some(4)),
213+
("PC", None, Some(5), Some(4)),
214+
("PD", Some(5), None, None),
215+
("PD", Some(5), None, None),
216+
("PE", Some(7), Some(6), None),
217+
("PE", Some(7), Some(6), None),
218+
("PF", None, None, None)
219+
).toDF("patientID", "deathBucket", "tracklossBucket", "diseaseBucket")
177220

178221
val expected = Seq(
179222
("PA", Some(2)),
@@ -182,9 +225,11 @@ class MLPPWriterSuite extends SharedContext {
182225
("PB", Some(3)),
183226
("PC", Some(4)),
184227
("PC", Some(4)),
185-
("PD", Some(4)),
186-
("PD", Some(4)),
187-
("PE", Some(16))
228+
("PD", Some(5)),
229+
("PD", Some(5)),
230+
("PE", Some(6)),
231+
("PE", Some(6)),
232+
("PF", Some(16))
188233
).toDF("patientID", "endBucket")
189234

190235
// When

src/test/scala/fr/polytechnique/cmap/cnam/utilities/ColumnUtilitiesSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ class ColumnUtilitiesSuite extends SharedContext{
185185
(Some(makeTS(2006, 1, 3)), Some(1)),
186186
(Some(makeTS(2006, 1, 10)), Some(4)),
187187
(Some(makeTS(2006, 1, 31)), Some(15)),
188-
(Some(makeTS(2006, 2, 2)), Some(15)),
188+
(Some(makeTS(2006, 2, 2)), None),
189189
(None, None)
190190
).toDF("input", "output")
191191

0 commit comments

Comments
 (0)