Skip to content

Commit 11d810f

Browse files
authored
#203 Commit (#210)
* #203 Commit * Committing suggested changes
1 parent 1e42e8c commit 11d810f

File tree

4 files changed

+51
-12
lines changed

4 files changed

+51
-12
lines changed

ingestor-default/src/main/scala/za/co/absa/hyperdrive/ingestor/implementation/reader/kafka/KafkaStreamReader.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,8 @@ private[reader] class KafkaStreamReader(
9595
}
9696

9797
private def getStartingOffsets(checkpointLocation: String, configuration: org.apache.hadoop.conf.Configuration): Option[String] = {
98-
if (FileUtils.exists(checkpointLocation, configuration)) {
99-
Option.empty
98+
if (FileUtils.exists(checkpointLocation, configuration) && !FileUtils.isEmpty(checkpointLocation, configuration)) {
99+
Option.empty
100100
}
101101
else {
102102
Option(STARTING_OFFSETS_EARLIEST)

ingestor-default/src/test/scala/za/co/absa/hyperdrive/ingestor/implementation/reader/kafka/TestKafkaStreamReader.scala

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ import org.scalatest.{BeforeAndAfterEach, FlatSpec}
2525
import za.co.absa.commons.io.TempDirectory
2626
import za.co.absa.hyperdrive.ingestor.implementation.reader.kafka.KafkaStreamReaderProps._
2727

28+
import java.nio.file.{Files, Paths}
29+
2830
class TestKafkaStreamReader extends FlatSpec with BeforeAndAfterEach with MockitoSugar {
2931

3032
private val validTopic = "test-topic"
@@ -97,19 +99,43 @@ class TestKafkaStreamReader extends FlatSpec with BeforeAndAfterEach with Mockit
9799
verify(dataStreamReader, never()).options(validExtraConfs)
98100
}
99101

100-
it should "set offsets to earliest if no checkpoint location exists" in {
102+
it should "set offsets to earliest if checkpoint location does not exist" in {
101103
val sparkContext = getMockedSparkContext(stopped = false)
102104
val dataStreamReader = getMockedDataStreamReader
103105
val sparkSession = getConfiguredMockedSparkSession(sparkContext, dataStreamReader)
104-
105106
val nonExistent = tempDir.path.resolve("non-existent")
107+
106108
val reader = new KafkaStreamReader(validTopic, validBrokers, nonExistent.toUri.getPath, Map())
107109
reader.read(sparkSession)
108110

109111
verify(dataStreamReader).option(WORD_STARTING_OFFSETS, STARTING_OFFSETS_EARLIEST)
110112
}
111113

112-
it should "set offsets to user-defined property if no checkpoint location exists" in {
114+
it should "set offsets to earliest if checkpoint location is empty" in {
115+
val sparkContext = getMockedSparkContext(stopped = false)
116+
val dataStreamReader = getMockedDataStreamReader
117+
val sparkSession = getConfiguredMockedSparkSession(sparkContext, dataStreamReader)
118+
Files.createDirectories(Paths.get(s"$tempDirPath/empty1/empty2/empty3"))
119+
120+
val reader = new KafkaStreamReader(validTopic, validBrokers, tempDirPath, Map())
121+
reader.read(sparkSession)
122+
123+
verify(dataStreamReader).option(WORD_STARTING_OFFSETS, STARTING_OFFSETS_EARLIEST)
124+
}
125+
126+
it should "not set offsets to earliest if a checkpoint location exists and is not empty" in {
127+
val sparkContext = getMockedSparkContext(stopped = false)
128+
val dataStreamReader = getMockedDataStreamReader
129+
val sparkSession = getConfiguredMockedSparkSession(sparkContext, dataStreamReader)
130+
Files.createFile(Paths.get(s"$tempDirPath/anyFile"))
131+
132+
val reader = new KafkaStreamReader(validTopic, validBrokers, tempDirPath, Map())
133+
reader.read(sparkSession)
134+
135+
verify(dataStreamReader, never()).option(WORD_STARTING_OFFSETS, STARTING_OFFSETS_EARLIEST)
136+
}
137+
138+
it should "always set offsets to user-defined property e.g. if checkpoint location does not exist" in {
113139
val sparkContext = getMockedSparkContext(stopped = false)
114140
val dataStreamReader = getMockedDataStreamReader
115141
val sparkSession = getConfiguredMockedSparkSession(sparkContext, dataStreamReader)
@@ -121,15 +147,18 @@ class TestKafkaStreamReader extends FlatSpec with BeforeAndAfterEach with Mockit
121147
verify(dataStreamReader).options(Map(WORD_STARTING_OFFSETS -> "latest"))
122148
}
123149

124-
it should "not set offsets if a checkpoint location exists" in {
150+
it should "always set offsets to user-defined property e.g. if checkpoint location exists" in {
125151
val sparkContext = getMockedSparkContext(stopped = false)
126152
val dataStreamReader = getMockedDataStreamReader
127153
val sparkSession = getConfiguredMockedSparkSession(sparkContext, dataStreamReader)
128154

129-
val reader = new KafkaStreamReader(validTopic, validBrokers, tempDirPath, Map())
155+
Files.createFile(Paths.get(s"$tempDirPath/anyFile"))
156+
157+
val nonExistent = tempDir.path.resolve("non-existent")
158+
val reader = new KafkaStreamReader(validTopic, validBrokers, nonExistent.toUri.getPath, Map(WORD_STARTING_OFFSETS -> "latest"))
130159
reader.read(sparkSession)
131160

132-
verify(dataStreamReader, never()).option(WORD_STARTING_OFFSETS, STARTING_OFFSETS_EARLIEST)
161+
verify(dataStreamReader).options(Map(WORD_STARTING_OFFSETS -> "latest"))
133162
}
134163

135164
private def getMockedSparkContext(stopped: Boolean): SparkContext = {

shared/src/main/scala/za/co/absa/hyperdrive/shared/utils/FileUtils.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ private[hyperdrive] object FileUtils {
3737
def isEmpty(directory: String, configuration: Configuration): Boolean = {
3838
val fs = getFileSystem(configuration)
3939
val path = new Path(directory)
40-
fs.exists(path) && !fs.listLocatedStatus(path).hasNext
40+
fs.exists(path) && !fs.listFiles(path, true).hasNext
4141
}
4242

4343
private def getFileSystem(configuration: Configuration): FileSystem = {

shared/src/test/scala/za/co/absa/hyperdrive/shared/utils/TestFileUtils.scala

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@
1616
package za.co.absa.hyperdrive.shared.utils
1717

1818
import java.util.UUID
19-
20-
import org.apache.hadoop.fs.{FileSystem, Path}
19+
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path, RemoteIterator}
2120
import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers}
2221
import za.co.absa.commons.io.TempDirectory
2322
import za.co.absa.commons.spark.SparkTestBase
@@ -49,11 +48,22 @@ class TestFileUtils extends FlatSpec with Matchers with SparkTestBase with Befor
4948
FileUtils.isEmpty(directory, config) shouldBe true
5049
}
5150

51+
it should "return true if the directory only contains other directories, but no files" in {
52+
// given
53+
val directory = s"$baseDirPath/empty1"
54+
val subDirectory = s"$baseDirPath/empty1/empty2/empty3"
55+
fs.mkdirs(new Path(subDirectory))
56+
57+
// when, then
58+
FileUtils.isEmpty(directory, config) shouldBe true
59+
}
60+
5261
it should "return false if the directory is not empty" in {
5362
// given
5463
val directory = s"$baseDirPath/empty"
64+
val subDirectory = s"$baseDirPath/empty/empty2"
5565
fs.mkdirs(new Path(directory))
56-
fs.create(new Path(directory, "_INFO"))
66+
fs.create(new Path(subDirectory, "_INFO"))
5767

5868
// when, then
5969
FileUtils.isEmpty(directory, config) shouldBe false

0 commit comments

Comments
 (0)