
Commit 85e95b7

HeartSaVioR authored and HyukjinKwon committed
[SPARK-28142][SS] Use CaseInsensitiveStringMap for KafkaContinuousStream
## What changes were proposed in this pull request?

This patch addresses a spot that was missed when the options map was switched to CaseInsensitiveStringMap - KafkaContinuousStream appears to be the only one left. Before this fix there was a related bug: `pollTimeoutMs` was always set to its default value, because `KafkaSourceProvider.CONSUMER_POLL_TIMEOUT` is the mixed-case key `kafkaConsumer.pollTimeoutMs`, while the map provided as `sourceOptions` had lower-cased keys, so the lookup never matched.

## How was this patch tested?

N/A.

Closes apache#24942 from HeartSaVioR/MINOR-use-case-insensitive-map-for-kafka-continuous-source.

Authored-by: Jungtaek Lim (HeartSaVioR) <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
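To make the root cause concrete, here is a minimal Scala sketch of the lookup mismatch. It is not code from this patch; the object name and the option value "1000" are invented for illustration.

```scala
// Hypothetical sketch of the bug described above; not code from the patch.
object PollTimeoutLookupSketch {
  // Mirrors KafkaSourceProvider.CONSUMER_POLL_TIMEOUT, a mixed-case key.
  val CONSUMER_POLL_TIMEOUT = "kafkaConsumer.pollTimeoutMs"

  def main(args: Array[String]): Unit = {
    // The source is handed a map whose keys have been lower-cased.
    val lowerCasedOptions = Map("kafkaconsumer.polltimeoutms" -> "1000")

    // A case-sensitive lookup never matches the mixed-case key,
    // so the default "512" is always used and the user setting is ignored.
    val buggy = lowerCasedOptions.getOrElse(CONSUMER_POLL_TIMEOUT, "512").toLong
    println(buggy) // 512

    // A case-insensitive lookup (what CaseInsensitiveStringMap provides)
    // does find the user-provided value.
    val fixed = lowerCasedOptions.getOrElse(CONSUMER_POLL_TIMEOUT.toLowerCase, "512").toLong
    println(fixed) // 1000
  }
}
```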
1 parent b5e183c commit 85e95b7

File tree

2 files changed, +5 -4 lines


external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousStream.scala

Lines changed: 4 additions & 3 deletions
@@ -30,14 +30,15 @@ import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 import org.apache.spark.sql.kafka010.KafkaSourceProvider.{INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE, INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE}
 import org.apache.spark.sql.sources.v2.reader._
 import org.apache.spark.sql.sources.v2.reader.streaming._
+import org.apache.spark.sql.util.CaseInsensitiveStringMap

 /**
  * A [[ContinuousStream]] for data from kafka.
  *
  * @param offsetReader a reader used to get kafka offsets. Note that the actual data will be
  *                     read by per-task consumers generated later.
  * @param kafkaParams String params for per-task Kafka consumers.
- * @param sourceOptions Params which are not Kafka consumer params.
+ * @param options Params which are not Kafka consumer params.
  * @param metadataPath Path to a directory this reader can use for writing metadata.
  * @param initialOffsets The Kafka offsets to start reading data at.
  * @param failOnDataLoss Flag indicating whether reading should fail in data loss
@@ -47,14 +48,14 @@ import org.apache.spark.sql.sources.v2.reader.streaming._
 class KafkaContinuousStream(
     offsetReader: KafkaOffsetReader,
     kafkaParams: ju.Map[String, Object],
-    sourceOptions: Map[String, String],
+    options: CaseInsensitiveStringMap,
     metadataPath: String,
     initialOffsets: KafkaOffsetRangeLimit,
     failOnDataLoss: Boolean)
   extends ContinuousStream with Logging {

   private val pollTimeoutMs =
-    sourceOptions.getOrElse(KafkaSourceProvider.CONSUMER_POLL_TIMEOUT, "512").toLong
+    options.getLong(KafkaSourceProvider.CONSUMER_POLL_TIMEOUT, 512)

   // Initialized when creating reader factories. If this diverges from the partitions at the latest
   // offsets, we need to reconfigure.

external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala

Lines changed: 1 addition & 1 deletion
@@ -449,7 +449,7 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister
     new KafkaContinuousStream(
       kafkaOffsetReader,
       kafkaParamsForExecutors(specifiedKafkaParams, uniqueGroupId),
-      parameters,
+      options,
       checkpointLocation,
       startingStreamOffsets,
       failOnDataLoss(caseInsensitiveParams))
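For reference, a small self-contained sketch of how `CaseInsensitiveStringMap.getLong` resolves the mixed-case constant after the fix. It assumes the Spark SQL artifact is on the classpath; the object name and the option value "1000" are made up for illustration.

```scala
import java.util.{HashMap => JHashMap}

import org.apache.spark.sql.util.CaseInsensitiveStringMap

object CaseInsensitiveLookupSketch {
  def main(args: Array[String]): Unit = {
    // Options as the source receives them, keyed in lower case.
    val raw = new JHashMap[String, String]()
    raw.put("kafkaconsumer.polltimeoutms", "1000")
    val options = new CaseInsensitiveStringMap(raw)

    // Lookups ignore case, so the mixed-case key now resolves,
    // and getLong parses the value, falling back to 512 only when absent.
    println(options.getLong("kafkaConsumer.pollTimeoutMs", 512L)) // 1000
    println(options.getLong("some.other.key", 512L))              // 512
  }
}
```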
