Skip to content

Commit 6db1a5c

Browse files
committed
Update pipeline methods
1 parent f1c4f9a commit 6db1a5c

File tree

8 files changed

+4044
-15
lines changed

8 files changed

+4044
-15
lines changed

build.sbt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ scalaVersion := "2.11.8"
77

88
libraryDependencies ++= Seq(
99
"org.apache.spark" %% "spark-core" % "2.1.0",
10-
"org.elasticsearch" %% "elasticsearch-spark-20" % "5.4.0",
10+
"org.elasticsearch" %% "elasticsearch-spark-20" % "5.6.0",
1111
"edu.stanford.nlp" % "stanford-corenlp" % "3.6.0" artifacts(Artifact("stanford-corenlp", "models"), Artifact("stanford-corenlp")),
1212
"ch.qos.logback" % "logback-classic" % "1.2.3",
1313
"org.json4s" %% "json4s-native" % "3.5.0",

ipl-tweet.csv

Lines changed: 4002 additions & 0 deletions
Large diffs are not rendered by default.

src/main/scala/com/techmonad/pipeline/DataPipeline.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ object DataPipeline {
2323
val sourceRDD = applySource(workFlow.source)
2424
val validatedRDD = applyValidation(sourceRDD, workFlow.validations)
2525
val transformedRDD = applyTransformation(validatedRDD, workFlow.transformations)
26-
val schemaValidatedRDD = applySchemaValidation(transformedRDD, workFlow.schemaValidation)
26+
val schemaValidatedRDD = applySchemaValidation(transformedRDD, workFlow.schemaValidations)
2727
applySink(schemaValidatedRDD, workFlow.sink)
2828
} match {
2929
case Success(sink) =>
@@ -37,7 +37,7 @@ object DataPipeline {
3737
}
3838

3939
private def applySource(source: Source)(implicit sc: SparkContext) = {
40-
CSVReader.read(source.dir)
40+
CSVReader.read(source.path)
4141
}
4242

4343
private def applyValidation(rdd: RDD[Record], validations: List[String]): RDD[Record] =
@@ -67,5 +67,5 @@ object DataPipeline {
6767
sink.`type` match {
6868
case "ES" => new ESPersistenceRDD(rdd)
6969
}
70-
70+
7171
}

src/main/scala/com/techmonad/pipeline/RunDataPipeline.scala

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,38 @@ import com.techmonad.pipeline.workflow.WorkFlow
77
object RunDataPipeline extends App with SparkContextProvider with JsonHelper {
88

99
val workFlowJson =
10-
if (args.length < 1)
11-
throw new IllegalArgumentException("Data directory and workflow json are required")
12-
else
13-
args(0)
10+
"""
11+
|{
12+
| "source": {
13+
| "type": "CSV",
14+
| "path": "ipl-tweet.csv",
15+
| "meta":{"text_field":"text","date_field": "date","author_field":"author_name" }
16+
| },
17+
|
18+
| "validations": [ "COLUMN_VALIDATION", "FIELD_VALIDATION" ],
19+
|
20+
| "transformations": ["SENTIMENT_ANALYSIS" ],
21+
|
22+
| "schemaValidations": [ ],
23+
|
24+
| "sink": {
25+
| "type": "ES",
26+
| "meta":{ "index": "data_index","type": "twitter" }
27+
| }
28+
|}
29+
""".stripMargin
30+
31+
32+
/* if (args.length < 1)
33+
throw new IllegalArgumentException("Data directory and workflow json are required")
34+
else
35+
args(0)*/
1436

1537
val workFlow = parse(workFlowJson).extract[WorkFlow]
1638

1739
DataPipeline(workFlow).run
40+
41+
sc.stop()
42+
1843
}
1944

src/main/scala/com/techmonad/pipeline/transformation/sentiment/NLPSentimentAnalyzer.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,12 @@ import edu.stanford.nlp.sentiment.SentimentCoreAnnotations
1313
import scala.collection.convert.wrapAll._
1414

1515

16-
object SentimentAnalyzer extends Transformation {
16+
object SentimentAnalyzer extends Transformation with Serializable{
1717

1818
override def transform(record: Record): Record =
1919
if (record.status != Status.ERROR) {
2020
val sentiment: String = NLPSentimentAnalyzer.getSentiment(record.data("text").toString)
21+
println("Analyzing Sentiment........... " + sentiment)
2122
record.copy(data = record.data + ("sentiment" -> sentiment))
2223
} else {
2324
record

src/main/scala/com/techmonad/pipeline/util/SparkContextProvider.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ trait SparkContextProvider {
77

88
val conf = new SparkConf().setMaster("local[*]").setAppName("DataPipeline")
99
implicit val sc = new SparkContext(conf)
10+
sc.setLogLevel("WARN")
1011

1112

1213
}

src/main/scala/com/techmonad/pipeline/validation/source/MandatoryColumnValidation.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,15 @@ import com.techmonad.pipeline.Record
44
import com.techmonad.pipeline.util.Status
55
import com.techmonad.pipeline.validation.Validation
66

7-
object MandatoryColumnValidation extends Validation {
7+
object MandatoryColumnValidation extends Validation with Serializable{
88

99

1010
override def name: String = "COLUMN_VALIDATION"
1111

1212
override def validate(record: Record): Record =
1313
record.data.get("text") match {
1414
case Some(text: String) if (text.trim.nonEmpty) =>
15-
record.data.get("date") match {
15+
record.data.get("date") match {
1616
case Some(date: String) if (date.trim.nonEmpty) =>
1717
record
1818
case None =>

src/main/scala/com/techmonad/pipeline/workflow/WorkFlows.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ case class WorkFlow(
44
source: Source,
55
validations: List[String],
66
transformations: List[String],
7-
schemaValidation: List[String],
7+
schemaValidations: List[String],
88
sink: Sink
99
)
1010

11-
case class Source(`type`: String, dir: String, meta: Map[String, String])
11+
case class Source(`type`: String, path: String, meta: Map[String, String])
1212

1313
case class Sink(`type`: String, meta: Map[String, String])
1414

@@ -18,7 +18,7 @@ case class Sink(`type`: String, meta: Map[String, String])
1818
|{
1919
| "source": {
2020
| "type": "CSV",
21-
| "dir": "s3://data/bucket_name",
21+
| "path": "/home/satendra/decooda/testing-csv",
2222
| "meta":{"text_field":"text","date_field": "date","author_field":"author_name" }
2323
| },
2424
|
@@ -29,7 +29,7 @@ case class Sink(`type`: String, meta: Map[String, String])
2929
| "schemaValidation": [ "DATA_MODEL_VALIDATION" ],
3030
|
3131
| "sink": {
32-
| "storage_type": "ES",
32+
| "type": "ES",
3333
| "meta":{ "index": "data_index","type": "twitter" }
3434
| }
3535
|}

0 commit comments

Comments (0)