Skip to content

Commit e3435ec

Browse files
committed
#809 Add test suites for reading compressed ASCII files.
1 parent c7ff7e8 commit e3435ec

File tree

2 files changed

+51
-6
lines changed

2 files changed

+51
-6
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,9 @@ Among the motivations for this project, it is possible to highlight:
3939

4040
- The COBOL copybooks parser doesn't have a Spark dependency and can be reused for integrating into other data processing engines.
4141

42+
- Supports reading files compressed in a Hadoop-compatible way (gzip, bzip2, etc.), but with limited parallelism (only per-file parallelism).
43+
Uncompressed files are preferred for performance.
44+
4245
## Videos
4346

4447
We have presented Cobrix at DataWorks Summit 2019 and Spark Summit 2019 conferences. The screencasts are available here:

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test40CompressesFilesSpec.scala

Lines changed: 48 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@ package za.co.absa.cobrix.spark.cobol.source.integration
1818

1919
import org.scalatest.Assertion
2020
import org.scalatest.funsuite.AnyFunSuite
21-
import org.slf4j.{Logger, LoggerFactory}
2221
import za.co.absa.cobrix.cobol.parser.CopybookParser
2322
import za.co.absa.cobrix.cobol.parser.policies.DebugFieldsPolicy
2423
import za.co.absa.cobrix.spark.cobol.source.base.{SimpleComparisonBase, SparkTestBase}
@@ -30,8 +29,6 @@ import java.nio.file.{Files, Paths}
3029
import scala.collection.JavaConverters._
3130

3231
class Test40CompressesFilesSpec extends AnyFunSuite with SparkTestBase with BinaryFileFixture with SimpleComparisonBase {
33-
private implicit val logger: Logger = LoggerFactory.getLogger(this.getClass)
34-
3532
private val exampleName = "Test40 (compressed files)"
3633

3734
private val inputCopybookPath = "file://../data/test40_copybook.cob"
@@ -92,15 +89,41 @@ class Test40CompressesFilesSpec extends AnyFunSuite with SparkTestBase with Bina
9289
succeed
9390
}
9491

95-
test("Test gzip") {
92+
def testAsciiFile(options: Map[String, String]): Assertion = {
93+
val inputDataPath = "../data/test40_data_ascii/ascii.txt.gz"
94+
95+
val df = spark
96+
.read
97+
.format("cobol")
98+
.option("copybook_contents",
99+
"""
100+
| 01 RECORD.
101+
| 05 DATA PIC X(5).
102+
|""".stripMargin)
103+
.option("record_format", "D")
104+
.option("pedantic", "true")
105+
.options(options)
106+
.load(inputDataPath)
107+
108+
assert(df.count == 3)
109+
110+
val actual = df.orderBy("data")
111+
.collect()
112+
.map(a => a.getString(0))
113+
.mkString(",")
114+
115+
assert(actual == "12345,67890,A1234")
116+
}
117+
118+
test("Test compressed EBCDIC gzip file") {
96119
testCompressedFile("../data/test40_data/example.dat.gz")
97120
}
98121

99-
test("Test bzip2") {
122+
test("Test compressed EBCDIC bzip2 file") {
100123
testCompressedFile("../data/test40_data/example.dat.bz2")
101124
}
102125

103-
test("read mixed compressed files") {
126+
test("read mixed compressed EBCDIC files") {
104127
val inputDataPath = "../data/test40_data"
105128

106129
val df = spark
@@ -115,4 +138,23 @@ class Test40CompressesFilesSpec extends AnyFunSuite with SparkTestBase with Bina
115138

116139
assert(df.count == 300)
117140
}
141+
142+
test("read a compressed ASCII file 1") {
143+
testAsciiFile(Map(
144+
"record_format" -> "D"
145+
))
146+
}
147+
148+
test("read a compressed ASCII file 2") {
149+
testAsciiFile(Map(
150+
"record_format" -> "D",
151+
"ascii_charset" -> "ISO-8859-1"
152+
))
153+
}
154+
155+
test("read a compressed ASCII file 3") {
156+
testAsciiFile(Map(
157+
"record_format" -> "D2"
158+
))
159+
}
118160
}

0 commit comments

Comments (0)