Skip to content

Commit 965de82

Browse files
committed
Fixed TopicSourceParquetSink
- Throw an exception in doUpdateDf (TopicSourceParquetSink) when the DataFrame is empty, so that SinkData can retry reading the Parquet file. Also moved the ConfluentSparkAvroUtils factory to its appropriate file.
1 parent 5953727 commit 965de82

File tree

5 files changed

+40
-37
lines changed

5 files changed

+40
-37
lines changed

src/main/scala/com/databricks/spark/avro/ConfluentSparkAvroUtils.scala

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import scala.collection.JavaConverters._
3535
import scalaz.Memo
3636
import play.api.libs.json.Json
3737

38+
import scala.collection.mutable
3839
import scala.util.Try
3940

4041
/**
@@ -128,4 +129,21 @@ class ConfluentSparkAvroUtils(schemaRegistryURLs: String) extends Serializable {
128129
schemaAndType =>
129130
SchemaConverters.createConverterToSQL(schemaAndType._1, schemaAndType._2)
130131
}
131-
}
132+
}
133+
134+
/**
 * Factory for [[ConfluentSparkAvroUtils]].
 *
 * Memoizes one utils instance per schema-registry URL so that repeated
 * lookups for the same registry reuse the already-created instance.
 */
object ConfluentSparkAvroUtils {

  // Cache of utils instances keyed by schema-registry URL.
  // NOTE(review): kept as a plain mutable.Map to preserve the existing
  // public interface; all access goes through the synchronized apply()
  // below because this object-level cache may be hit from multiple
  // threads concurrently (e.g. several streaming queries), and
  // getOrElseUpdate on a non-concurrent Map is not thread-safe.
  val avroRegistries: mutable.Map[String, ConfluentSparkAvroUtils] =
    mutable.Map[String, ConfluentSparkAvroUtils]()

  /**
   * Returns the cached [[ConfluentSparkAvroUtils]] for the given URL,
   * creating and caching a new instance on first use.
   *
   * @param schemaRegistryURL schema-registry URL used as the cache key
   * @return the memoized utils instance for that URL
   */
  def apply(schemaRegistryURL: String): ConfluentSparkAvroUtils =
    avroRegistries.synchronized {
      avroRegistries.getOrElseUpdate(
        schemaRegistryURL,
        new ConfluentSparkAvroUtils(schemaRegistryURL)
      )
    }
}

src/main/scala/com/haufe/umantis/ds/sources/kafka/KafkaDeserializer.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ class KafkaDeserializer(conf: TopicConf) {
2828

2929
lazy val avroUtils: Option[ConfluentSparkAvroUtils] =
3030
conf.kafkaConf.schemaRegistryURL match {
31-
case Some(url) => Some(TopicSourceParquetSink.getAvroUtils(url))
31+
case Some(url) => Some(ConfluentSparkAvroUtils(url))
3232
case _ => None
3333
}
3434

src/main/scala/com/haufe/umantis/ds/sources/kafka/SinkData.scala

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -46,22 +46,16 @@ trait SinkData extends Source {
4646
log("Updating DataFrame")
4747
// Thread.sleep(100)
4848

49-
sink match {
50-
case Some(_) =>
51-
(1 to 30).foreach(retryNr => {
52-
log(s"Reading Fresh Data Try # $retryNr")
53-
54-
try{
55-
return doUpdateDf()
56-
} catch {
57-
case _: AnalysisException =>
58-
Thread.sleep(1000)
59-
}
60-
})
61-
case _ =>
62-
// in case isReadOnly == true
49+
(1 to 30).foreach(retryNr => {
50+
log(s"Reading Fresh Data Try # $retryNr")
51+
52+
try{
6353
return doUpdateDf()
64-
}
54+
} catch {
55+
case _: AnalysisException => Thread.sleep(1000)
56+
case _: NoSuchElementException => Thread.sleep(1000)
57+
}
58+
})
6559

6660
throw new KafkaTopicNotAvailableException(
6761
s"Kafka topic ${conf.kafkaTopic.topic} not ready!")

src/main/scala/com/haufe/umantis/ds/sources/kafka/TopicSourceParquetSink.scala

Lines changed: 10 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ extends TopicSourceSink(conf)
7474

7575
outputSchema = sourceDf.schema
7676

77+
sourceDf.printSchema()
78+
7779
val s = sourceDf
7880
.writeStream
7981
.outputMode("append")
@@ -181,11 +183,18 @@ extends TopicSourceSink(conf)
181183
.sql(s"select * from parquet.`$fname`")
182184
.toDF()
183185
} else {
184-
currentSparkSession
186+
val df = currentSparkSession
185187
.read
186188
.schema(outputSchema)
187189
.format("parquet")
188190
.load(fname)
191+
192+
// if the df is empty (because no data has been written yet)
193+
// we want to trigger an exception (caught in SinkData.updateDf()
194+
// so that we can retry to read the df
195+
df.head()
196+
197+
df
189198
}
190199
.repartition(conf.sinkConf.numPartitions)
191200

@@ -208,24 +217,6 @@ extends TopicSourceSink(conf)
208217
}
209218
}
210219

211-
/**
212-
* Factory for [[TopicSourceParquetSink]].
213-
*/
214-
object TopicSourceParquetSink {
215-
216-
val avroRegistries: mutable.Map[String, ConfluentSparkAvroUtils] =
217-
mutable.Map[String, ConfluentSparkAvroUtils]()
218-
219-
def getAvroUtils(schemaRegistryURL: String): ConfluentSparkAvroUtils = {
220-
avroRegistries
221-
.getOrElseUpdate(
222-
schemaRegistryURL,
223-
new ConfluentSparkAvroUtils(schemaRegistryURL)
224-
)
225-
}
226-
}
227-
228-
229220

230221
/**
231222
* Exception thrown if a Kafka topic is not available.

src/test/scala/com/haufe/umantis/ds/sources/kafka/TopicSourceEventSourcingSpec.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ trait TopicSourceEventSourcingSpec
8888
).pipeline
8989
)
9090
}
91-
def sinkConf = SinkConf(transformationFunction, 1 /* seconds */, 4 /* num partitions */)
91+
def sinkConf = SinkConf(transformationFunction, refreshTime = 1 /* seconds */, numPartitions = 4)
9292
def conf = TopicConf(kafkaConf, topicName, sinkConf)
9393
def ts: TopicSourceSink
9494

0 commit comments

Comments
 (0)