|
16 | 16 | package za.co.absa.hyperdrive.ingestor.implementation.transformer.avro.confluent |
17 | 17 |
|
18 | 18 | import org.apache.commons.configuration2.BaseConfiguration |
19 | | -import org.scalatest.{FlatSpec, Matchers} |
| 19 | +import org.apache.spark.sql.execution.streaming.MemoryStream |
| 20 | +import org.apache.spark.sql.streaming.Trigger |
| 21 | +import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} |
| 22 | +import za.co.absa.abris.avro.read.confluent.SchemaManagerFactory |
| 23 | +import za.co.absa.abris.config.AbrisConfig |
| 24 | +import za.co.absa.commons.spark.SparkTestBase |
| 25 | +import za.co.absa.hyperdrive.ingestor.implementation.testutils.HyperdriveMockSchemaRegistryClient |
20 | 26 | import za.co.absa.hyperdrive.ingestor.implementation.transformer.avro.confluent.ConfluentAvroEncodingTransformer._ |
21 | 27 | import za.co.absa.hyperdrive.ingestor.implementation.utils.AbrisConfigUtil |
22 | 28 | import za.co.absa.hyperdrive.ingestor.implementation.writer.kafka.KafkaStreamWriter |
23 | 29 |
|
24 | | -class TestConfluentAvroEncodingTransformer extends FlatSpec with Matchers { |
| 30 | +class TestConfluentAvroEncodingTransformer extends FlatSpec with Matchers with BeforeAndAfter with SparkTestBase { |
25 | 31 |
|
26 | 32 | private val topic = "topic" |
27 | | - private val schemaRegistryURL = "http://localhost:8081" |
| 33 | + private val SchemaRegistryURL = "http://localhost:8081" |
28 | 34 |
|
29 | 35 | behavior of ConfluentAvroEncodingTransformer.getClass.getSimpleName |
30 | 36 |
|
| 37 | + before { |
| 38 | + val mockSchemaRegistryClient = new HyperdriveMockSchemaRegistryClient() |
| 39 | + SchemaManagerFactory.resetSRClientInstance() |
| 40 | + SchemaManagerFactory.addSRClientInstance(Map(AbrisConfig.SCHEMA_REGISTRY_URL -> SchemaRegistryURL), mockSchemaRegistryClient) |
| 41 | + } |
| 42 | + |
31 | 43 | it should "create avro stream encoder" in { |
32 | 44 | val config = new BaseConfiguration |
33 | 45 | config.addProperty(KafkaStreamWriter.KEY_TOPIC, topic) |
34 | | - config.addProperty(KEY_SCHEMA_REGISTRY_URL, schemaRegistryURL) |
| 46 | + config.addProperty(KEY_SCHEMA_REGISTRY_URL, SchemaRegistryURL) |
35 | 47 | config.addProperty(KEY_SCHEMA_REGISTRY_VALUE_NAMING_STRATEGY, AbrisConfigUtil.TopicNameStrategy) |
36 | 48 |
|
37 | 49 | val encoder = ConfluentAvroEncodingTransformer(config).asInstanceOf[ConfluentAvroEncodingTransformer] |
38 | 50 |
|
39 | 51 | encoder.config shouldBe config |
40 | 52 | encoder.withKey shouldBe false |
41 | 53 | } |
| 54 | + |
| 55 | + it should "encode the values" in { |
| 56 | + // given |
| 57 | + import spark.implicits._ |
| 58 | + val queryName = "dummyQuery" |
| 59 | + val input = MemoryStream[Int](1, spark.sqlContext) |
| 60 | + input.addData(1 to 100) |
| 61 | + val df = input.toDF() |
| 62 | + |
| 63 | + // when |
| 64 | + val config = new BaseConfiguration() |
| 65 | + config.addProperty(KafkaStreamWriter.KEY_TOPIC, topic) |
| 66 | + config.addProperty(KEY_SCHEMA_REGISTRY_URL, SchemaRegistryURL) |
| 67 | + config.addProperty(KEY_SCHEMA_REGISTRY_VALUE_NAMING_STRATEGY, AbrisConfigUtil.TopicNameStrategy) |
| 68 | + |
| 69 | + val encoder = ConfluentAvroEncodingTransformer(config) |
| 70 | + val transformedDf = encoder.transform(df) |
| 71 | + val query = transformedDf |
| 72 | + .writeStream |
| 73 | + .trigger(Trigger.Once) |
| 74 | + .queryName(queryName) |
| 75 | + .format("memory") |
| 76 | + .start() |
| 77 | + query.awaitTermination() |
| 78 | + |
| 79 | + // then |
| 80 | + val outputDf = spark.sql(s"select * from $queryName")
| 81 | + outputDf.count() shouldBe 100
| 82 | + val byteArrays = outputDf.select("value").map(_ (0).asInstanceOf[Array[Byte]]).collect()
| 83 | + byteArrays.distinct.length shouldBe byteArrays.length
| 84 | + }
42 | 85 | }
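
Note for reviewers, not part of the diff above: the final assertion only checks that the 100 encoded byte arrays are pairwise distinct. A stronger check would decode the column back and compare payloads. Below is a minimal sketch of that round trip using ABRiS's `from_avro`, assuming it runs inside the same test (so `topic`, `SchemaRegistryURL`, `outputDf`, and `spark.implicits._` are in scope), that the transformer registered the value schema under TopicNameStrategy against the mocked registry client from the `before` block, and that the generated Avro record exposes the original column as field `value`.

import org.apache.spark.sql.functions.col
import za.co.absa.abris.avro.functions.from_avro

// Reader config mirroring the writer side: resolve the latest schema for the
// topic through the (mocked) schema registry client registered in `before`.
val fromAvroConfig = AbrisConfig
  .fromConfluentAvro
  .downloadReaderSchemaByLatestVersion
  .andTopicNameStrategy(topic)
  .usingSchemaRegistry(SchemaRegistryURL)

// Decoding should recover the original payload: the integers 1 to 100.
// The nested field name `value` is an assumption about the generated schema.
val decodedDf = outputDf.select(from_avro(col("value"), fromAvroConfig).as("data"))
decodedDf.select("data.value").as[Int].collect() should contain theSameElementsAs (1 to 100)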