Merge pull request #180 from AbsaOSS/feature/179-copy-transformer

kevinwallimann · web-flow · commit 00d23a56efce · 2020-11-25T10:37:00.000+01:00
#179: Implement copy transformer
diff --git a/README.md b/README.md
@@ -183,6 +183,52 @@ component.transformer.class.{transformer-id} = za.co.absa.hyperdrive.ingestor.im
 | `transformer.{transformer-id}.columns.rename.from` | Yes | A comma-separated list of columns to rename. For example, `column1, column2`. |
 | `transformer.{transformer-id}.columns.rename.to` | Yes | A comma-separated list of new column names. For example, `column1_new, column2_new`. |
 
+##### ColumnCopyStreamTransformer
+`ColumnCopyStreamTransformer` allows copying of columns specified in the configuration. Dots in column names are interpreted as nested structs,
+unless they are surrounded by backticks (same as Spark convention).
+
+Note that usage of the star-operator `*` within column names is not supported and may lead to unexpected behaviour.
+
+To add the transformer to the pipeline use this class name:
+```
+component.transformer.class.{transformer-id} = za.co.absa.hyperdrive.ingestor.implementation.transformer.column.copy.ColumnCopyStreamTransformer
+```
+
+| Property Name | Required | Description |
+| :--- | :---: | :--- |
+| `transformer.{transformer-id}.columns.copy.from` | Yes | A comma-separated list of columns to copy from. For example, `column1.fieldA, column2.fieldA`. |
+| `transformer.{transformer-id}.columns.copy.to` | Yes | A comma-separated list of new column names. For example, `newColumn.col1_fieldA, newColumn.col2_fieldA`. |
+
+**Example**
+
+Given a dataframe with the following schema
+```
+ |-- column1
+ |    |-- fieldA
+ |    |-- fieldB
+ |-- column2
+ |    |-- fieldA
+ |-- column3
+```
+
+Then, the following column parameters
+- `transformer.{transformer-id}.columns.copy.from=column1.fieldA, column2.fieldA`
+- `transformer.{transformer-id}.columns.copy.to=newColumn.col1_fieldA, newColumn.col2_fieldA`
+
+will produce the following schema
+```
+ |-- column1
+ |    |-- fieldA
+ |    |-- fieldB
+ |-- column2
+ |    |-- fieldA
+ |-- column3
+ |-- newColumn
+ |    |-- col1_fieldA
+ |    |-- col2_fieldA
+
+```
+
 See [Pipeline settings](#pipeline-settings) for details about `{transformer-id}`.
 ##### ParquetStreamWriter
 | Property Name | Required | Description |
diff --git a/driver/src/test/scala/za/co/absa/hyperdrive/driver/drivers/KafkaToKafkaDockerTest.scala b/driver/src/test/scala/za/co/absa/hyperdrive/driver/drivers/KafkaToKafkaDockerTest.scala
@@ -86,7 +86,9 @@ class KafkaToKafkaDockerTest extends FlatSpec with Matchers with SparkTestBase w
       "component.transformer.class.[avro.decoder]" -> "za.co.absa.hyperdrive.ingestor.implementation.transformer.avro.confluent.ConfluentAvroDecodingTransformer",
       "component.transformer.id.1" -> "column.selector",
       "component.transformer.class.column.selector" -> "za.co.absa.hyperdrive.ingestor.implementation.transformer.column.selection.ColumnSelectorStreamTransformer",
-      "component.transformer.id.2" -> "[avro.encoder]",
+      "component.transformer.id.2" -> "[column.copy]",
+      "component.transformer.class.[column.copy]" -> "za.co.absa.hyperdrive.ingestor.implementation.transformer.column.copy.ColumnCopyStreamTransformer",
+      "component.transformer.id.3" -> "[avro.encoder]",
       "component.transformer.class.[avro.encoder]" -> "za.co.absa.hyperdrive.ingestor.implementation.transformer.avro.confluent.ConfluentAvroEncodingTransformer",
       "component.writer" -> "za.co.absa.hyperdrive.ingestor.implementation.writer.kafka.KafkaStreamWriter",
 
@@ -108,6 +110,10 @@ class KafkaToKafkaDockerTest extends FlatSpec with Matchers with SparkTestBase w
       // comma separated list of columns to select
       "transformer.column.selector.columns.to.select" -> "*",
 
+      // copy transformer settings
+      "transformer.[column.copy].columns.copy.from" -> "some_id, value_field",
+      "transformer.[column.copy].columns.copy.to" -> "grouped.some_id, grouped.value_field",
+
       // Avro Encoder (ABRiS) settings
       "transformer.[avro.encoder].schema.registry.url" -> "${transformer.[avro.decoder].schema.registry.url}",
       "transformer.[avro.encoder].value.schema.naming.strategy" -> "topic.name",
@@ -144,9 +150,15 @@ class KafkaToKafkaDockerTest extends FlatSpec with Matchers with SparkTestBase w
       .asScala.map(_.getType) should contain theSameElementsAs Seq(Type.STRING, Type.NULL)
 
     val valueFieldNames = records.head.value().getSchema.getFields.asScala.map(_.name())
-    valueFieldNames should contain theSameElementsAs List("some_id", "value_field")
-    records.map(_.value().get("some_id")) should contain theSameElementsInOrderAs List.range(0, numberOfRecords)
-    records.map(_.value().get("value_field")).distinct should contain theSameElementsAs List(new Utf8("valueHello"))
+    valueFieldNames should contain theSameElementsAs List("some_id", "value_field", "grouped")
+    val allIds = records.map(_.value().get("some_id"))
+    val allValues = records.map(_.value().get("value_field"))
+    allIds should contain theSameElementsInOrderAs List.range(0, numberOfRecords)
+    allValues.distinct should contain theSameElementsAs List(new Utf8("valueHello"))
+    val idsInGrouped = records.map(_.value().get("grouped").asInstanceOf[GenericRecord].get("some_id"))
+    val valuesInGrouped = records.map(_.value().get("grouped").asInstanceOf[GenericRecord].get("value_field"))
+    idsInGrouped shouldBe allIds
+    valuesInGrouped shouldBe allValues
   }
 
   after {
diff --git a/ingestor-default/src/main/resources/META-INF/services/za.co.absa.hyperdrive.ingestor.api.transformer.StreamTransformerFactoryProvider b/ingestor-default/src/main/resources/META-INF/services/za.co.absa.hyperdrive.ingestor.api.transformer.StreamTransformerFactoryProvider
@@ -17,3 +17,4 @@ za.co.absa.hyperdrive.ingestor.implementation.transformer.dateversion.AddDateVer
 za.co.absa.hyperdrive.ingestor.implementation.transformer.avro.confluent.ConfluentAvroDecodingTransformerLoader
 za.co.absa.hyperdrive.ingestor.implementation.transformer.enceladus.columns.AddEnceladusColumnsTransformerLoader
 za.co.absa.hyperdrive.ingestor.implementation.transformer.column.renaming.ColumnRenamingStreamTransformerLoader
+za.co.absa.hyperdrive.ingestor.implementation.transformer.column.copy.ColumnCopyStreamTransformerLoader
diff --git a/ingestor-default/src/main/scala/za/co/absa/hyperdrive/ingestor/implementation/transformer/column/copy/ColumnCopyStreamTransformer.scala b/ingestor-default/src/main/scala/za/co/absa/hyperdrive/ingestor/implementation/transformer/column/copy/ColumnCopyStreamTransformer.scala
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.hyperdrive.ingestor.implementation.transformer.column.copy
+
+import org.apache.commons.configuration2.Configuration
+import org.apache.logging.log4j.LogManager
+import org.apache.spark.sql.{Column, DataFrame}
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
+import za.co.absa.hyperdrive.ingestor.api.transformer.{StreamTransformer, StreamTransformerFactory}
+import za.co.absa.hyperdrive.ingestor.api.utils.ConfigUtils
+
+/**
+ * @param columnsFrom list of columns to copy from. Nested structs can be specified with dots
+ * @param columnsTo list of columns to copy to. Nested structs can be specified with dots
+ * Example:
+ * Given a dataframe with the following schema
+ * |-- orig1
+ * |    |-- nestedOrig1
+ * |-- orig2
+ * |    |-- nestedOrig2
+ * |-- orig3
+ * and the following parameters
+ *
+ * columnsFrom=Seq("orig1.nestedOrig1","orig2")
+ * columnsTo=Seq("new1.nested1.nested11","new1.nested2")
+ *
+ * will produce the following schema
+ * |-- orig1
+ * |    |-- nestedOrig1
+ * |-- orig2
+ * |    |-- nestedOrig2
+ * |-- orig3
+ * |-- new1
+ * |    |-- nested1
+ * |    |    |-- nested11
+ * |    |-- nested2
+ * |    |    |-- nestedOrig2
+ *
+ */
+private[transformer] class ColumnCopyStreamTransformer(val columnsFrom: Seq[String], val columnsTo: Seq[String]) extends StreamTransformer {
+  /**
+   * Node of the tree describing the target columns.
+   *
+   * @param name           name of the target field at this level
+   * @param copyColumnName source column to copy; set on leaf nodes, None on intermediate nodes
+   * @param children       nested target fields; empty for leaf nodes
+   */
+  case class Node(name: String, copyColumnName: Option[String], children: Seq[Node])
+
+  // Source and target columns are paired by position, so both lists must have the same length.
+  if (columnsFrom.size != columnsTo.size) {
+    throw new IllegalArgumentException("The size of source column names doesn't match the list of target column names " +
+      s"${columnsFrom.size} != ${columnsTo.size}.")
+  }
+
+  /**
+   * Adds the configured copies to the dataframe.
+   *
+   * All target paths are merged into one tree below an artificial root node. Each child of the root
+   * is then added to the dataframe with `withColumn`, as a plain column or as a struct of its children.
+   *
+   * @param dataFrame the dataframe to add the copied columns to
+   * @return the dataframe with one added column per distinct top-level target name
+   */
+  def transform(dataFrame: DataFrame): DataFrame = {
+    val rootNodeName = "root"
+    // Every target path is prefixed with the root name so that all paths share a common ancestor.
+    val targetPaths = columnsTo.map(target => rootNodeName :: UnresolvedAttribute.parseAttributeName(target).toList)
+    val emptyTree = Node(rootNodeName, None, Seq())
+    val copyTree = columnsFrom.zip(targetPaths).foldLeft(emptyTree) {
+      case (tree, (source, path)) => createNode(source, path, Some(tree))
+    }
+
+    copyTree.children.foldLeft(dataFrame) { (df, topNode) =>
+      df.withColumn(topNode.name, createColumn(topNode))
+    }
+  }
+
+  /**
+   * Returns a node in which the given source field is registered at the given path.
+   *
+   * @param sourceField name of the column to copy from
+   * @param treePath    remaining path; its head is the name of the node to create
+   * @param node        existing node to merge into, if any
+   * @return the new node. The last path element always becomes a fresh leaf, discarding anything that
+   *         existed at that path. When merging, a child with the same name as the next path element
+   *         is replaced and placed at the end of the children.
+   */
+  private def createNode(sourceField: String, treePath: List[String], node: Option[Node]): Node = treePath match {
+    case leafName :: Nil => Node(leafName, Some(sourceField), Seq())
+    case head :: tail =>
+      node.filter(_.name == head) match {
+        case Some(existing) =>
+          // tail is non-empty here, because the single-element path is handled by the first case.
+          val (sameName, others) = existing.children.partition(_.name == tail.head)
+          Node(head, None, others :+ createNode(sourceField, tail, sameName.headOption))
+        case None =>
+          Node(head, None, Seq(createNode(sourceField, tail, None)))
+      }
+  }
+
+  /**
+   * Converts a node to a column expression aliased with the node's name.
+   *
+   * @param node the node to convert
+   * @return the source column for a leaf node, otherwise a struct of the converted children
+   * @throws IllegalStateException if a leaf node has no source column name
+   */
+  private def createColumn(node: Node): Column =
+    if (node.children.isEmpty) {
+      val sourceColumn = node.copyColumnName.getOrElse(
+        throw new IllegalStateException(s"Expected a copy column name at leaf node ${node}, got None"))
+      col(sourceColumn).as(node.name)
+    } else {
+      struct(node.children.map(createColumn): _*).as(node.name)
+    }
+}
+
+/**
+ * Factory creating [[ColumnCopyStreamTransformer]] instances from a configuration.
+ */
+object ColumnCopyStreamTransformer extends StreamTransformerFactory with ColumnCopyStreamTransformerAttributes {
+  /**
+   * Reads the source and target column lists from the configuration and creates the transformer.
+   *
+   * @param config configuration providing the values for KEY_COLUMNS_FROM and KEY_COLUMNS_TO
+   * @return a transformer copying the source columns to the target columns
+   */
+  override def apply(config: Configuration): StreamTransformer = {
+    val columnsFrom = ConfigUtils.getSeqOrThrow(KEY_COLUMNS_FROM, config)
+    val columnsTo = ConfigUtils.getSeqOrThrow(KEY_COLUMNS_TO, config)
+    LogManager.getLogger.info("Going to create ColumnCopyStreamTransformer using: " +
+      s"columnsFrom='${columnsFrom.mkString(",")}', columnsTo='${columnsTo.mkString(",")}'")
+    new ColumnCopyStreamTransformer(columnsFrom, columnsTo)
+  }
+}
diff --git a/ingestor-default/src/main/scala/za/co/absa/hyperdrive/ingestor/implementation/transformer/column/copy/ColumnCopyStreamTransformerAttributes.scala b/ingestor-default/src/main/scala/za/co/absa/hyperdrive/ingestor/implementation/transformer/column/copy/ColumnCopyStreamTransformerAttributes.scala
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.hyperdrive.ingestor.implementation.transformer.column.copy
+
+import za.co.absa.hyperdrive.ingestor.api.{HasComponentAttributes, PropertyMetadata}
+
+/**
+ * Configuration keys and descriptive metadata of the column copy transformer.
+ */
+trait ColumnCopyStreamTransformerAttributes extends HasComponentAttributes {
+  /** Configuration key of the list of columns to copy from. */
+  val KEY_COLUMNS_FROM = "columns.copy.from"
+  /** Configuration key of the list of columns to copy to. */
+  val KEY_COLUMNS_TO = "columns.copy.to"
+
+  override def getName: String = "Column Copy Transformer"
+
+  override def getDescription: String = "This transformer copies given columns. Column expressions are not possible"
+
+  /**
+   * @return the metadata of the two required properties, keyed by their configuration keys
+   */
+  override def getProperties: Map[String, PropertyMetadata] = {
+    val sourceColumns = PropertyMetadata(
+      "Source column names",
+      Some("Comma separated list of columns to be copied."),
+      required = true)
+    val targetColumns = PropertyMetadata(
+      "Target column names",
+      Some("Comma separated list of new names of the columns. The number of columns should match the list of source columns."),
+      required = true)
+
+    Map(
+      KEY_COLUMNS_FROM -> sourceColumns,
+      KEY_COLUMNS_TO -> targetColumns
+    )
+  }
+}
diff --git a/ingestor-default/src/main/scala/za/co/absa/hyperdrive/ingestor/implementation/transformer/column/copy/ColumnCopyStreamTransformerLoader.scala b/ingestor-default/src/main/scala/za/co/absa/hyperdrive/ingestor/implementation/transformer/column/copy/ColumnCopyStreamTransformerLoader.scala
@@ -0,0 +1,22 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.hyperdrive.ingestor.implementation.transformer.column.copy
+
+import za.co.absa.hyperdrive.ingestor.api.transformer.{StreamTransformerFactory, StreamTransformerFactoryProvider}
+
+/**
+ * Provider returning the [[ColumnCopyStreamTransformer]] factory.
+ */
+class ColumnCopyStreamTransformerLoader extends StreamTransformerFactoryProvider {
+  /** @return the companion object of the column copy transformer, which acts as its factory */
+  override def getComponentFactory: StreamTransformerFactory = {
+    ColumnCopyStreamTransformer
+  }
+}
diff --git a/ingestor-default/src/test/scala/za/co/absa/hyperdrive/ingestor/implementation/TestServiceProviderConfiguration.scala b/ingestor-default/src/test/scala/za/co/absa/hyperdrive/ingestor/implementation/TestServiceProviderConfiguration.scala
@@ -24,6 +24,7 @@ import za.co.absa.hyperdrive.ingestor.api.writer.{StreamWriterFactory, StreamWri
 import za.co.absa.hyperdrive.ingestor.api.{ComponentFactory, ComponentFactoryProvider}
 import za.co.absa.hyperdrive.ingestor.implementation.reader.kafka.KafkaStreamReader
 import za.co.absa.hyperdrive.ingestor.implementation.transformer.avro.confluent.ConfluentAvroDecodingTransformer
+import za.co.absa.hyperdrive.ingestor.implementation.transformer.column.copy.ColumnCopyStreamTransformer
 import za.co.absa.hyperdrive.ingestor.implementation.transformer.column.renaming.ColumnRenamingStreamTransformer
 import za.co.absa.hyperdrive.ingestor.implementation.transformer.column.selection.ColumnSelectorStreamTransformer
 import za.co.absa.hyperdrive.ingestor.implementation.transformer.dateversion.AddDateVersionTransformer
@@ -44,8 +45,14 @@ class TestServiceProviderConfiguration extends FlatSpec with Matchers {
 
   it should "load StreamTransformers" in {
     val factoryProviders = loadServices[StreamTransformerFactoryProvider, StreamTransformerFactory]()
-    factoryProviders should contain theSameElementsAs Seq(AddDateVersionTransformer,
-      ColumnSelectorStreamTransformer, ConfluentAvroDecodingTransformer, AddEnceladusColumnsTransformer, ColumnRenamingStreamTransformer)
+    factoryProviders should contain theSameElementsAs Seq(
+      AddDateVersionTransformer,
+      ColumnSelectorStreamTransformer,
+      ConfluentAvroDecodingTransformer,
+      AddEnceladusColumnsTransformer,
+      ColumnRenamingStreamTransformer,
+      ColumnCopyStreamTransformer
+    )
   }
 
   it should "load StreamWriters" in {
diff --git a/ingestor-default/src/test/scala/za/co/absa/hyperdrive/ingestor/implementation/transformer/column/copy/TestColumnCopyStreamTransformer.scala b/ingestor-default/src/test/scala/za/co/absa/hyperdrive/ingestor/implementation/transformer/column/copy/TestColumnCopyStreamTransformer.scala
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.hyperdrive.ingestor.implementation.transformer.column.copy
+
+import org.apache.commons.configuration2.DynamicCombinedConfiguration
+import org.apache.commons.configuration2.convert.DefaultListDelimiterHandler
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.RowEncoder
+import org.apache.spark.sql.execution.streaming.MemoryStream
+import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType}
+import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers}
+import za.co.absa.commons.spark.SparkTestBase
+
+class TestColumnCopyStreamTransformer extends FlatSpec with SparkTestBase with Matchers with BeforeAndAfter {
+
+  /** Creates a comma-delimited configuration holding the given source and target column lists. */
+  private def createConfig(columnsFrom: String, columnsTo: String): DynamicCombinedConfiguration = {
+    val config = new DynamicCombinedConfiguration()
+    config.setListDelimiterHandler(new DefaultListDelimiterHandler(','))
+    config.addProperty(ColumnCopyStreamTransformer.KEY_COLUMNS_FROM, columnsFrom)
+    config.addProperty(ColumnCopyStreamTransformer.KEY_COLUMNS_TO, columnsTo)
+    config
+  }
+
+  it should "copy columns while leaving existing columns intact" in {
+    // given
+    val col1TopType = new StructType().add("Field.1", StringType, nullable = false)
+    val col2TopType = new StructType()
+      .add("Field2", StringType)
+      .add("Field3", new ArrayType(IntegerType, containsNull = true))
+    val inputSchema = new StructType()
+      .add("col1Top", col1TopType, nullable = false)
+      .add("col2Top", col2TopType)
+      .add("col3Top", StringType)
+    val memoryStream = new MemoryStream[Row](1, spark.sqlContext)(RowEncoder(inputSchema))
+    val df = memoryStream.toDF()
+
+    val config = createConfig(
+      "col1Top.`Field.1`, col2Top, col2Top, col2Top.Field2, col2Top.Field3",
+      "copy1.`cp.col1Top`.cpField1, copy1.cpCol2Top, copy2, copy2.l1.l2.l3.Field2, copy2.l1.l2.Field3")
+
+    // when
+    val resultDf = ColumnCopyStreamTransformer(config).transform(df)
+
+    // then
+    val copy1Type = new StructType()
+      .add("cp.col1Top", new StructType().add("cpField1", StringType, nullable = false), nullable = false)
+      .add("cpCol2Top", col2TopType)
+    // The plain "copy2" target is superseded by the later, deeper "copy2.l1..." targets.
+    val l3Type = new StructType().add("Field2", StringType)
+    val l2Type = new StructType()
+      .add("l3", l3Type, nullable = false)
+      .add("Field3", new ArrayType(IntegerType, containsNull = true))
+    val l1Type = new StructType().add("l2", l2Type, nullable = false)
+    val copy2Type = new StructType().add("l1", l1Type, nullable = false)
+
+    // The original columns are expected unchanged, followed by the two copied top-level columns.
+    val expectedSchema = inputSchema
+      .add("copy1", copy1Type, nullable = false)
+      .add("copy2", copy2Type, nullable = false)
+    resultDf.schema shouldBe expectedSchema
+  }
+
+  it should "throw an exception if columns from do not match columns to" in {
+    val config = createConfig("col1,col2", "col1")
+
+    val ex = the[IllegalArgumentException] thrownBy ColumnCopyStreamTransformer(config)
+    ex.getMessage should include("The size of source column names doesn't match")
+  }
+}
diff --git a/ingestor-default/src/test/scala/za/co/absa/hyperdrive/ingestor/implementation/transformer/column/copy/TestColumnCopyStreamTransformerObject.scala b/ingestor-default/src/test/scala/za/co/absa/hyperdrive/ingestor/implementation/transformer/column/copy/TestColumnCopyStreamTransformerObject.scala