Skip to content

Commit 726b1a1

Browse files
Upgrade to Abris v4.0.0 (#169)
* Upgrade to Abris v4.0.0
* Fix merge error
1 parent 8cf8e3c commit 726b1a1

File tree

14 files changed

+584
-415
lines changed

14 files changed

+584
-415
lines changed

driver/src/test/scala/za/co/absa/hyperdrive/driver/drivers/KafkaToKafkaDockerTest.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ class KafkaToKafkaDockerTest extends FlatSpec with Matchers with SparkTestBase w
150150
}
151151

152152
after {
153-
SchemaManagerFactory.resetClientInstance()
153+
SchemaManagerFactory.resetSRClientInstance()
154154
}
155155

156156
def createProducer(kafkaSchemaRegistryWrapper: KafkaSchemaRegistryWrapper): KafkaProducer[GenericRecord, GenericRecord] = {

driver/src/test/scala/za/co/absa/hyperdrive/driver/drivers/KafkaToParquetDockerTest.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ class KafkaToParquetDockerTest extends FlatSpec with Matchers with SparkTestBase
124124
}
125125

126126
after {
127-
SchemaManagerFactory.resetClientInstance()
127+
SchemaManagerFactory.resetSRClientInstance()
128128
}
129129

130130
private def createProducer(kafkaSchemaRegistryWrapper: KafkaSchemaRegistryWrapper): KafkaProducer[Int, GenericRecord] = {

driver/src/test/scala/za/co/absa/hyperdrive/driver/drivers/KafkaToParquetIncrementingVersionDockerTest.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ class KafkaToParquetIncrementingVersionDockerTest extends FlatSpec with Matchers
124124
}
125125

126126
after {
127-
SchemaManagerFactory.resetClientInstance()
127+
SchemaManagerFactory.resetSRClientInstance()
128128
}
129129

130130
private def produceMessage(numberOfRecords: Int, producer: KafkaProducer[Int, GenericRecord], schema: Schema, topic: String) = {

ingestor-default/src/main/scala/za/co/absa/hyperdrive/ingestor/implementation/transformer/avro/confluent/ConfluentAvroDecodingTransformer.scala

Lines changed: 21 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -16,47 +16,40 @@
1616
package za.co.absa.hyperdrive.ingestor.implementation.transformer.avro.confluent
1717

1818
import org.apache.commons.configuration2.Configuration
19-
import org.apache.commons.lang3.{RandomStringUtils, StringUtils}
19+
import org.apache.commons.lang3.RandomStringUtils
2020
import org.apache.logging.log4j.LogManager
2121
import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull
2222
import org.apache.spark.sql.functions.col
2323
import org.apache.spark.sql.{Column, DataFrame}
24-
import za.co.absa.abris.avro.functions.from_confluent_avro
25-
import za.co.absa.abris.avro.read.confluent.SchemaManager._
24+
import za.co.absa.abris.avro.functions.from_avro
25+
import za.co.absa.abris.config.FromAvroConfig
2626
import za.co.absa.hyperdrive.ingestor.api.context.HyperdriveContext
2727
import za.co.absa.hyperdrive.ingestor.api.transformer.{StreamTransformer, StreamTransformerFactory}
2828
import za.co.absa.hyperdrive.ingestor.api.utils.ConfigUtils
29-
import za.co.absa.hyperdrive.ingestor.api.utils.ConfigUtils.getOrThrow
3029
import za.co.absa.hyperdrive.ingestor.implementation.HyperdriveContextKeys
31-
import za.co.absa.hyperdrive.ingestor.implementation.utils.{SchemaRegistryConsumerConfigKeys, SchemaRegistrySettingsUtil}
3230
import za.co.absa.hyperdrive.ingestor.implementation.reader.kafka.KafkaStreamReader.KEY_TOPIC
31+
import za.co.absa.hyperdrive.ingestor.implementation.utils.{SchemaRegistryConsumerConfigKeys, AbrisConfigUtil}
3332

3433
private[transformer] class ConfluentAvroDecodingTransformer(
35-
val valueSchemaRegistrySettings: Map[String, String],
36-
val keySchemaRegistrySettings: Option[Map[String, String]])
34+
val valueAvroConfig: FromAvroConfig,
35+
val keyAvroConfigOpt: Option[FromAvroConfig])
3736
extends StreamTransformer {
3837

39-
if (valueSchemaRegistrySettings.isEmpty) {
40-
throw new IllegalArgumentException(
41-
"Empty Schema Registry settings received.")
42-
}
43-
4438
private val logger = LogManager.getLogger
4539

4640
override def transform(dataFrame: DataFrame): DataFrame = {
47-
logger.info(s"SchemaRegistry settings: $valueSchemaRegistrySettings")
41+
logger.info(s"SchemaRegistry settings: $valueAvroConfig")
4842

49-
keySchemaRegistrySettings match {
50-
case Some(keySettings) => getKeyValueDataFrame(dataFrame, keySettings)
43+
keyAvroConfigOpt match {
44+
case Some(keyAvroConfig) => getKeyValueDataFrame(dataFrame, keyAvroConfig)
5145
case None => getValueDataFrame(dataFrame)
5246
}
5347
}
5448

55-
private def getKeyValueDataFrame(dataFrame: DataFrame,
56-
keySchemaRegistrySettings: Map[String, String]) = {
49+
private def getKeyValueDataFrame(dataFrame: DataFrame, keyAvroConfig: FromAvroConfig) = {
5750
val decodedDf = dataFrame.select(
58-
from_confluent_avro(col("key"), keySchemaRegistrySettings) as 'key,
59-
from_confluent_avro(col("value"), valueSchemaRegistrySettings) as 'value)
51+
from_avro(col("key"), keyAvroConfig) as 'key,
52+
from_avro(col("value"), valueAvroConfig) as 'value)
6053
val keyValueDf = setColumnNonNullable(decodedDf, "value")
6154

6255
val keyColumnNames = keyValueDf.select("key.*").columns.toSeq
@@ -73,7 +66,7 @@ private[transformer] class ConfluentAvroDecodingTransformer(
7366

7467
private def getValueDataFrame(dataFrame: DataFrame) = {
7568
val decodedDf = dataFrame
76-
.select(from_confluent_avro(col("value"), valueSchemaRegistrySettings) as 'data)
69+
.select(from_avro(col("value"), valueAvroConfig) as 'data)
7770
setColumnNonNullable(decodedDf, "data")
7871
.select("data.*")
7972
}
@@ -89,54 +82,35 @@ private[transformer] class ConfluentAvroDecodingTransformer(
8982
object ConfluentAvroDecodingTransformer extends StreamTransformerFactory with ConfluentAvroDecodingTransformerAttributes {
9083
private val keyColumnPrefixLength = 4
9184

92-
object ValueSchemaConfigKeys extends SchemaRegistryConsumerConfigKeys {
85+
object SchemaConfigKeys extends SchemaRegistryConsumerConfigKeys {
86+
override val topic: String = KEY_TOPIC
9387
override val schemaRegistryUrl: String = KEY_SCHEMA_REGISTRY_URL
9488
override val schemaId: String = KEY_SCHEMA_REGISTRY_VALUE_SCHEMA_ID
9589
override val namingStrategy: String = KEY_SCHEMA_REGISTRY_VALUE_NAMING_STRATEGY
9690
override val recordName: String = KEY_SCHEMA_REGISTRY_VALUE_RECORD_NAME
9791
override val recordNamespace: String = KEY_SCHEMA_REGISTRY_VALUE_RECORD_NAMESPACE
98-
override val paramSchemaId: String = PARAM_VALUE_SCHEMA_ID
99-
override val paramSchemaNamingStrategy: String = PARAM_VALUE_SCHEMA_NAMING_STRATEGY
100-
override val paramSchemaNameForRecordStrategy: String = PARAM_VALUE_SCHEMA_NAME_FOR_RECORD_STRATEGY
101-
override val paramSchemaNamespaceForRecordStrategy: String = PARAM_VALUE_SCHEMA_NAMESPACE_FOR_RECORD_STRATEGY
102-
}
103-
104-
object KeySchemaConfigKeys extends SchemaRegistryConsumerConfigKeys {
105-
override val schemaRegistryUrl: String = KEY_SCHEMA_REGISTRY_URL
106-
override val schemaId: String = KEY_SCHEMA_REGISTRY_KEY_SCHEMA_ID
107-
override val namingStrategy: String = KEY_SCHEMA_REGISTRY_KEY_NAMING_STRATEGY
108-
override val recordName: String = KEY_SCHEMA_REGISTRY_KEY_RECORD_NAME
109-
override val recordNamespace: String = KEY_SCHEMA_REGISTRY_KEY_RECORD_NAMESPACE
110-
override val paramSchemaId: String = PARAM_KEY_SCHEMA_ID
111-
override val paramSchemaNamingStrategy: String = PARAM_KEY_SCHEMA_NAMING_STRATEGY
112-
override val paramSchemaNameForRecordStrategy: String = PARAM_KEY_SCHEMA_NAME_FOR_RECORD_STRATEGY
113-
override val paramSchemaNamespaceForRecordStrategy: String = PARAM_KEY_SCHEMA_NAMESPACE_FOR_RECORD_STRATEGY
11492
}
11593

11694
override def apply(config: Configuration): StreamTransformer = {
117-
val topic = getTopic(config)
118-
val valueSchemaRegistrySettings = SchemaRegistrySettingsUtil.getConsumerSettings(config, topic, ValueSchemaConfigKeys)
95+
val valueAvroConfig = AbrisConfigUtil.getValueConsumerSettings(config, SchemaConfigKeys)
11996

12097
val consumeKeys = ConfigUtils.getOptionalBoolean(KEY_CONSUME_KEYS, config).getOrElse(false)
121-
val keySchemaRegistrySettingsOpt = if (consumeKeys) {
122-
Some(SchemaRegistrySettingsUtil.getConsumerSettings(config, topic, KeySchemaConfigKeys))
98+
val keyAvroConfigOpt = if (consumeKeys) {
99+
Some(AbrisConfigUtil.getKeyConsumerSettings(config, SchemaConfigKeys))
123100
} else {
124101
None
125102
}
126103
LogManager.getLogger.info(
127-
s"Going to create ConfluentAvroDecodingTransformer instance using: topic='$topic', " +
128-
s"value schema registry settings='$valueSchemaRegistrySettings', key schema registry settings='$keySchemaRegistrySettingsOpt'.")
104+
s"Going to create ConfluentAvroDecodingTransformer instance using " +
105+
s"value avro config='$valueAvroConfig', key avro config='$keyAvroConfigOpt'.")
129106

130-
new ConfluentAvroDecodingTransformer(valueSchemaRegistrySettings, keySchemaRegistrySettingsOpt)
107+
new ConfluentAvroDecodingTransformer(valueAvroConfig, keyAvroConfigOpt)
131108
}
132109

133110
override def getMappingFromRetainedGlobalConfigToLocalConfig(globalConfig: Configuration): Map[String, String] = Map(
134111
KEY_TOPIC -> KEY_TOPIC
135112
)
136113

137-
private def getTopic(configuration: Configuration): String =
138-
getOrThrow(KEY_TOPIC, configuration, errorMessage = s"Topic not found. Is '$KEY_TOPIC' properly set?")
139-
140114
def determineKeyColumnPrefix(valueColumnNames: Seq[String]): String = {
141115
var candidatePrefix = "key__"
142116
while (valueColumnNames.exists(c => c.startsWith(candidatePrefix))) {

ingestor-default/src/main/scala/za/co/absa/hyperdrive/ingestor/implementation/transformer/avro/confluent/ConfluentAvroEncodingTransformer.scala

Lines changed: 34 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -18,38 +18,34 @@ package za.co.absa.hyperdrive.ingestor.implementation.transformer.avro.confluent
1818
import org.apache.commons.configuration2.Configuration
1919
import org.apache.logging.log4j.LogManager
2020
import org.apache.spark.sql.DataFrame
21-
import org.apache.spark.sql.functions.{col, struct}
22-
import za.co.absa.abris.avro.functions.to_confluent_avro
23-
import za.co.absa.abris.avro.read.confluent.SchemaManager.{PARAM_KEY_SCHEMA_ID, PARAM_KEY_SCHEMA_NAMESPACE_FOR_RECORD_STRATEGY, PARAM_KEY_SCHEMA_NAME_FOR_RECORD_STRATEGY, PARAM_KEY_SCHEMA_NAMING_STRATEGY, PARAM_VALUE_SCHEMA_ID, PARAM_VALUE_SCHEMA_NAME_FOR_RECORD_STRATEGY, PARAM_VALUE_SCHEMA_NAMESPACE_FOR_RECORD_STRATEGY, PARAM_VALUE_SCHEMA_NAMING_STRATEGY}
21+
import org.apache.spark.sql.catalyst.expressions.Expression
22+
import org.apache.spark.sql.functions.struct
23+
import za.co.absa.abris.avro.functions.to_avro
24+
import za.co.absa.abris.config.ToAvroConfig
2425
import za.co.absa.hyperdrive.ingestor.api.context.HyperdriveContext
2526
import za.co.absa.hyperdrive.ingestor.api.transformer.{StreamTransformer, StreamTransformerFactory}
2627
import za.co.absa.hyperdrive.ingestor.api.utils.ConfigUtils
2728
import za.co.absa.hyperdrive.ingestor.implementation.HyperdriveContextKeys
28-
import za.co.absa.hyperdrive.ingestor.implementation.utils.{SchemaRegistryProducerConfigKeys, SchemaRegistrySettingsUtil}
29+
import za.co.absa.hyperdrive.ingestor.implementation.transformer.avro.confluent.ConfluentAvroEncodingTransformer.{getKeyAvroConfig, getValueAvroConfig}
30+
import za.co.absa.hyperdrive.ingestor.implementation.utils.{AbrisConfigUtil, SchemaRegistryProducerConfigKeys}
2931
import za.co.absa.hyperdrive.ingestor.implementation.writer.kafka.KafkaStreamWriter.KEY_TOPIC
3032

3133
private[transformer] class ConfluentAvroEncodingTransformer(
32-
val valueSchemaRegistrySettings: Map[String, String],
33-
val keySchemaRegistrySettings: Option[Map[String, String]])
34+
val config: Configuration,
35+
val withKey: Boolean)
3436
extends StreamTransformer {
3537

36-
if (valueSchemaRegistrySettings.isEmpty) {
37-
throw new IllegalArgumentException(
38-
"Empty Schema Registry settings received.")
39-
}
40-
4138
private val logger = LogManager.getLogger
4239

4340
override def transform(dataFrame: DataFrame): DataFrame = {
44-
logger.info(s"SchemaRegistry settings: $valueSchemaRegistrySettings")
45-
46-
keySchemaRegistrySettings match {
47-
case Some(keySettings) => getKeyValueDataFrame(dataFrame, keySettings)
48-
case None => getValueDataFrame(dataFrame)
41+
if (withKey) {
42+
getKeyValueDataFrame(dataFrame)
43+
} else {
44+
getValueDataFrame(dataFrame)
4945
}
5046
}
5147

52-
private def getKeyValueDataFrame(dataFrame: DataFrame, keySchemaRegistrySettings: Map[String, String]): DataFrame = {
48+
private def getKeyValueDataFrame(dataFrame: DataFrame): DataFrame = {
5349
val keyColumnPrefix = HyperdriveContext.get[String](HyperdriveContextKeys.keyColumnPrefix).get
5450
val keyColumnNames = HyperdriveContext.get[Seq[String]](HyperdriveContextKeys.keyColumnNames).get
5551
val prefixedKeyColumnNames = keyColumnNames.map(c => s"$keyColumnPrefix$c")
@@ -58,60 +54,49 @@ private[transformer] class ConfluentAvroEncodingTransformer(
5854
.filterNot(columnName => prefixedKeyColumnNames.contains(columnName))
5955
.map(c => dataFrame(c))
6056
val unprefixedKeyColumns = keyColumnNames.map(c => dataFrame(s"$keyColumnPrefix$c").as(c))
61-
val unprefixedDataFrame = dataFrame.select(struct(unprefixedKeyColumns: _*) as 'key, struct(valueColumns: _*) as 'value)
62-
unprefixedDataFrame.select(
63-
to_confluent_avro(col("key"), keySchemaRegistrySettings) as 'key,
64-
to_confluent_avro(col("value"), valueSchemaRegistrySettings) as 'value)
57+
val keyStruct = struct(unprefixedKeyColumns: _*) as 'key
58+
val valueStruct = struct(valueColumns: _*) as 'value
59+
val keyToAvroConfig = getKeyAvroConfig(config, keyStruct.expr)
60+
val valueToAvroConfig = getValueAvroConfig(config, valueStruct.expr)
61+
logger.info(s"Key ToAvro settings: $keyToAvroConfig")
62+
logger.info(s"Value ToAvro settings: $valueToAvroConfig")
63+
dataFrame.select(
64+
to_avro(keyStruct, keyToAvroConfig) as 'key,
65+
to_avro(valueStruct, valueToAvroConfig) as 'value)
6566
}
6667

6768
private def getValueDataFrame(dataFrame: DataFrame): DataFrame = {
6869
val allColumns = struct(dataFrame.columns.head, dataFrame.columns.tail: _*)
69-
dataFrame.select(to_confluent_avro(allColumns, valueSchemaRegistrySettings) as 'value)
70+
val toAvroConfig = getValueAvroConfig(config, allColumns.expr)
71+
logger.info(s"ToAvro settings: $toAvroConfig")
72+
dataFrame.select(to_avro(allColumns, toAvroConfig) as 'value)
7073
}
7174
}
7275

7376
object ConfluentAvroEncodingTransformer extends StreamTransformerFactory with ConfluentAvroEncodingTransformerAttributes {
7477

75-
object ValueSchemaConfigKeys extends SchemaRegistryProducerConfigKeys {
78+
object SchemaConfigKeys extends SchemaRegistryProducerConfigKeys {
79+
override val topic: String = KEY_TOPIC
7680
override val schemaRegistryUrl: String = KEY_SCHEMA_REGISTRY_URL
7781
override val namingStrategy: String = KEY_SCHEMA_REGISTRY_VALUE_NAMING_STRATEGY
7882
override val recordName: String = KEY_SCHEMA_REGISTRY_VALUE_RECORD_NAME
7983
override val recordNamespace: String = KEY_SCHEMA_REGISTRY_VALUE_RECORD_NAMESPACE
80-
override val paramSchemaId: String = PARAM_VALUE_SCHEMA_ID
81-
override val paramSchemaNamingStrategy: String = PARAM_VALUE_SCHEMA_NAMING_STRATEGY
82-
override val paramSchemaNameForRecordStrategy: String = PARAM_VALUE_SCHEMA_NAME_FOR_RECORD_STRATEGY
83-
override val paramSchemaNamespaceForRecordStrategy: String = PARAM_VALUE_SCHEMA_NAMESPACE_FOR_RECORD_STRATEGY
84-
}
85-
86-
object KeySchemaConfigKeys extends SchemaRegistryProducerConfigKeys {
87-
override val schemaRegistryUrl: String = KEY_SCHEMA_REGISTRY_URL
88-
override val namingStrategy: String = KEY_SCHEMA_REGISTRY_KEY_NAMING_STRATEGY
89-
override val recordName: String = KEY_SCHEMA_REGISTRY_KEY_RECORD_NAME
90-
override val recordNamespace: String = KEY_SCHEMA_REGISTRY_KEY_RECORD_NAMESPACE
91-
override val paramSchemaId: String = PARAM_KEY_SCHEMA_ID
92-
override val paramSchemaNamingStrategy: String = PARAM_KEY_SCHEMA_NAMING_STRATEGY
93-
override val paramSchemaNameForRecordStrategy: String = PARAM_KEY_SCHEMA_NAME_FOR_RECORD_STRATEGY
94-
override val paramSchemaNamespaceForRecordStrategy: String = PARAM_KEY_SCHEMA_NAMESPACE_FOR_RECORD_STRATEGY
9584
}
9685

9786
override def apply(config: Configuration): StreamTransformer = {
98-
val topic = config.getString(KEY_TOPIC)
99-
100-
val valueSchemaRegistrySettings = SchemaRegistrySettingsUtil.getProducerSettings(config, topic, ValueSchemaConfigKeys)
101-
val produceKeys = ConfigUtils.getOptionalBoolean(KEY_PRODUCE_KEYS, config).getOrElse(false)
102-
val keySchemaRegistrySettingsOpt = if (produceKeys) {
103-
Some(SchemaRegistrySettingsUtil.getProducerSettings(config, topic, KeySchemaConfigKeys))
104-
} else {
105-
None
106-
}
107-
108-
new ConfluentAvroEncodingTransformer(valueSchemaRegistrySettings, keySchemaRegistrySettingsOpt)
87+
val withKey = ConfigUtils.getOptionalBoolean(KEY_PRODUCE_KEYS, config).getOrElse(false)
88+
new ConfluentAvroEncodingTransformer(config, withKey)
10989
}
11090

11191
override def getMappingFromRetainedGlobalConfigToLocalConfig(globalConfig: Configuration): Map[String, String] = Map(
11292
KEY_TOPIC -> KEY_TOPIC
11393
)
11494

95+
def getKeyAvroConfig(config: Configuration, expression: Expression): ToAvroConfig =
96+
AbrisConfigUtil.getKeyProducerSettings(config, SchemaConfigKeys, expression)
97+
98+
def getValueAvroConfig(config: Configuration, expression: Expression): ToAvroConfig =
99+
AbrisConfigUtil.getValueProducerSettings(config, SchemaConfigKeys, expression)
115100
}
116101

117102

0 commit comments

Comments (0)