diff --git a/README.md b/README.md
index b77e64530..bf1df3226 100644
--- a/README.md
+++ b/README.md
@@ -1532,6 +1532,7 @@ The output looks like this:
| Option (usage example) | Description |
|-----------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| .option("string_trimming_policy", "both") | Specifies if and how string fields should be trimmed. Available options: `both` (default), `none`, `left`, `right`, `keep_all`. `keep_all` - keeps control characters when decoding ASCII text files |
+| .option("display_pic_always_string", "false") | If `true` fields that have `DISPLAY` format will always be converted to `string` type, even if such fields contain numbers, retaining leading and trailing zeros. Cannot be used together with `strict_integral_precision`. |
| .option("ebcdic_code_page", "common") | Specifies a code page for EBCDIC encoding. Currently supported values: `common` (default), `common_extended`, `cp037`, `cp037_extended`, and others (see "Currently supported EBCDIC code pages" section. |
| .option("ebcdic_code_page_class", "full.class.specifier") | Specifies a user provided class for a custom code page to UNICODE conversion. |
| .option("field_code_page:cp825", "field1, field2") | Specifies the code page for selected fields. You can add mo than 1 such option for multiple code page overrides. |
@@ -1541,7 +1542,7 @@ The output looks like this:
| .option("occurs_mapping", "{\"FIELD\": {\"X\": 1}}") | If specified, as a JSON string, allows for String `DEPENDING ON` fields with a corresponding mapping. |
| .option("strict_sign_overpunching", "true") | If `true` (default), sign overpunching will only be allowed for signed numbers. If `false`, overpunched positive sign will be allowed for unsigned numbers, but negative sign will result in null. |
| .option("improved_null_detection", "true") | If `true`(default), values that contain only 0x0 ror DISPLAY strings and numbers will be considered `null`s instead of empty strings. |
-| .option("strict_integral_precision", "true") | If `true`, Cobrix will not generate `short`/`integer`/`long` Spark data types, and always use `decimal(n)` with the exact precision that matches the copybook. |
+| .option("strict_integral_precision", "true") | If `true`, Cobrix will not generate `short`/`integer`/`long` Spark data types, and always use `decimal(n)` with the exact precision that matches the copybook. Cannot be used together with `display_pic_always_string`. |
| .option("binary_as_hex", "false") | By default fields that have `PIC X` and `USAGE COMP` are converted to `binary` Spark data type. If this option is set to `true`, such fields will be strings in HEX encoding. |
##### Modifier options
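For quick reference, a minimal usage sketch of the new option (the copybook and data paths are placeholders):

```scala
val df = spark.read
  .format("cobol")
  .option("copybook", "/path/to/copybook.cpy")
  .option("display_pic_always_string", "true") // DISPLAY numeric fields arrive as strings
  .load("/path/to/data")
```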
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala
index 1e002457f..fb352a1e0 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala
@@ -107,24 +107,25 @@ object CopybookParser extends Logging {
* Tokenizes a Cobol Copybook contents and returns the AST.
*
* @param dataEncoding Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
- * @param copyBookContents A string containing all lines of a copybook
- * @param dropGroupFillers Drop groups marked as fillers from the output AST
- * @param dropValueFillers Drop primitive fields marked as fillers from the output AST
- * @param fillerNamingPolicy Specifies a naming policy for fillers
+ * @param copyBookContents A string containing all lines of a copybook.
+ * @param dropGroupFillers Drop groups marked as fillers from the output AST.
+ * @param dropValueFillers Drop primitive fields marked as fillers from the output AST.
+ * @param fillerNamingPolicy Specifies a naming policy for fillers.
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* resolving segment redefines.
- * @param fieldParentMap A segment fields parent mapping
- * @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
- * @param strictSignOverpunch If true sign overpunching is not allowed for unsigned numbers
+ * @param fieldParentMap A segment fields parent mapping.
+ * @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed.
+ * @param isDisplayAlwaysString If true, all fields having DISPLAY format will remain strings and won't be converted to numbers.
+ * @param strictSignOverpunch If true, sign overpunching is not allowed for unsigned numbers.
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
- * @param commentPolicy Specifies a policy for comments truncation inside a copybook
- * @param ebcdicCodePage A code page for EBCDIC encoded data
- * @param asciiCharset A charset for ASCII encoded data
+ * @param commentPolicy Specifies a policy for comments truncation inside a copybook.
+ * @param ebcdicCodePage A code page for EBCDIC encoded data.
+ * @param asciiCharset A charset for ASCII encoded data.
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
- * @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
- * @param nonTerminals A list of non-terminals that should be extracted as strings
+ * @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754).
+ * @param nonTerminals A list of non-terminals that should be extracted as strings.
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what they should contain (false, hex, raw).
- * @return Seq[Group] where a group is a record inside the copybook
+ * @return Seq[Group] where a group is a record inside the copybook.
*/
def parse(copyBookContents: String,
dataEncoding: Encoding = EBCDIC,
@@ -134,6 +135,7 @@ object CopybookParser extends Logging {
segmentRedefines: Seq[String] = Nil,
fieldParentMap: Map[String, String] = HashMap[String, String](),
stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
+ isDisplayAlwaysString: Boolean = false,
commentPolicy: CommentPolicy = CommentPolicy(),
strictSignOverpunch: Boolean = true,
improvedNullDetection: Boolean = false,
@@ -155,6 +157,7 @@ object CopybookParser extends Logging {
segmentRedefines,
fieldParentMap,
stringTrimmingPolicy,
+ isDisplayAlwaysString,
commentPolicy,
strictSignOverpunch,
improvedNullDetection,
@@ -180,6 +183,7 @@ object CopybookParser extends Logging {
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
+ * @param isDisplayAlwaysString If true, all fields having DISPLAY format will remain strings and won't be converted to numbers
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param strictSignOverpunch If true sign overpunching is not allowed for unsigned numbers
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
@@ -198,6 +202,7 @@ object CopybookParser extends Logging {
segmentRedefines: Seq[String] = Nil,
fieldParentMap: Map[String, String] = HashMap[String, String](),
stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
+ isDisplayAlwaysString: Boolean = false,
commentPolicy: CommentPolicy = CommentPolicy(),
strictSignOverpunch: Boolean = true,
improvedNullDetection: Boolean = false,
@@ -219,6 +224,7 @@ object CopybookParser extends Logging {
segmentRedefines,
fieldParentMap,
stringTrimmingPolicy,
+ isDisplayAlwaysString,
commentPolicy,
strictSignOverpunch,
improvedNullDetection,
@@ -265,6 +271,7 @@ object CopybookParser extends Logging {
segmentRedefines: Seq[String],
fieldParentMap: Map[String, String],
stringTrimmingPolicy: StringTrimmingPolicy,
+ isDisplayAlwaysString: Boolean,
commentPolicy: CommentPolicy,
strictSignOverpunch: Boolean,
improvedNullDetection: Boolean,
@@ -279,7 +286,7 @@ object CopybookParser extends Logging {
debugFieldsPolicy: DebugFieldsPolicy,
fieldCodePageMap: Map[String, String]): Copybook = {
- val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, fieldCodePageMap)
+ val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, isDisplayAlwaysString, commentPolicy, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, fieldCodePageMap)
val nonTerms: Set[String] = (for (id <- nonTerminals)
yield transformIdentifier(id)
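For direct API users, a minimal sketch of passing the new flag to the parser (named argument; all other parameters keep their defaults; `copybookText` is assumed to hold the copybook contents):

```scala
import za.co.absa.cobrix.cobol.parser.CopybookParser

// Parse a copybook keeping DISPLAY numeric fields as strings.
val parsedCopybook = CopybookParser.parse(copybookText, isDisplayAlwaysString = true)
```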
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala
index 670f4ec5a..9bafa234f 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala
@@ -54,6 +54,7 @@ object ANTLRParser extends Logging {
def parse(copyBookContents: String,
enc: Encoding,
stringTrimmingPolicy: StringTrimmingPolicy,
+ isDisplayAlwaysString: Boolean,
commentPolicy: CommentPolicy,
strictSignOverpunch: Boolean,
improvedNullDetection: Boolean,
@@ -64,7 +65,7 @@ object ANTLRParser extends Logging {
isUtf16BigEndian: Boolean,
floatingPointFormat: FloatingPointFormat,
fieldCodePageMap: Map[String, String]): CopybookAST = {
- val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, fieldCodePageMap)
+ val visitor = new ParserVisitor(enc, stringTrimmingPolicy, isDisplayAlwaysString, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, fieldCodePageMap)
val strippedContents = filterSpecialCharacters(copyBookContents).split("\\r?\\n").map(
line =>
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala
index 20908ad08..5e5ed87ff 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala
@@ -41,6 +41,7 @@ sealed trait Expr
class ParserVisitor(enc: Encoding,
stringTrimmingPolicy: StringTrimmingPolicy,
+ isDisplayAlwaysString: Boolean,
ebcdicCodePage: CodePage,
asciiCharset: Charset,
isUtf16BigEndian: Boolean,
@@ -854,7 +855,7 @@ class ParserVisitor(enc: Encoding,
Map(),
isDependee = false,
identifier.toUpperCase() == Constants.FILLER,
- DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision)
+ DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, isDisplayAlwaysString, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian = isUtf16BigEndian, floatingPointFormat = floatingPointFormat, strictSignOverpunch = strictSignOverpunch, improvedNullDetection = improvedNullDetection, strictIntegralPrecision = strictIntegralPrecision)
) (Some(parent))
parent.children.append(prim)
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/asttransform/NonTerminalsAdder.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/asttransform/NonTerminalsAdder.scala
index b7d91d2f3..bb652df28 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/asttransform/NonTerminalsAdder.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/asttransform/NonTerminalsAdder.scala
@@ -73,7 +73,7 @@ class NonTerminalsAdder(
)
val sz = g.binaryProperties.actualSize
val dataType = AlphaNumeric(s"X($sz)", sz, enc = Some(enc))
- val decode = DecoderSelector.getDecoder(dataType, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection)
+ val decode = DecoderSelector.getDecoder(dataType, stringTrimmingPolicy, isDisplayAlwaysString = false, ebcdicCodePage = ebcdicCodePage, asciiCharset = asciiCharset, isUtf16BigEndian = isUtf16BigEndian, floatingPointFormat = floatingPointFormat, strictSignOverpunch = strictSignOverpunch, improvedNullDetection = improvedNullDetection)
val newName = getNonTerminalName(g.name, g.parent.get)
newChildren.append(
Primitive(
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala
index 462fbf03f..791d6ef9f 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala
@@ -56,6 +56,7 @@ object DecoderSelector {
*/
def getDecoder(dataType: CobolType,
stringTrimmingPolicy: StringTrimmingPolicy = TrimBoth,
+ isDisplayAlwaysString: Boolean = false,
ebcdicCodePage: CodePage = new CodePageCommon,
asciiCharset: Charset = StandardCharsets.US_ASCII,
isUtf16BigEndian: Boolean = true,
@@ -66,6 +67,7 @@ object DecoderSelector {
val decoder = dataType match {
case alphaNumeric: AlphaNumeric => getStringDecoder(alphaNumeric.enc.getOrElse(EBCDIC), stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, improvedNullDetection)
case decimalType: Decimal => getDecimalDecoder(decimalType, floatingPointFormat, strictSignOverpunch, improvedNullDetection)
+ case integralType: Integral if isDisplayAlwaysString => getDisplayDecoderAsString(integralType, improvedNullDetection, strictSignOverpunch)
case integralType: Integral => getIntegralDecoder(integralType, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision)
case _ => throw new IllegalStateException("Unknown AST object")
}
@@ -251,6 +253,29 @@ object DecoderSelector {
}
}
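+
+  /** Gets a decoder function that decodes DISPLAY-formatted integral fields as strings, retaining leading zeros and explicit signs. */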
+ private[parser] def getDisplayDecoderAsString(integralType: Integral,
+ improvedNullDetection: Boolean,
+ strictSignOverpunch: Boolean): Decoder = {
+ val encoding = integralType.enc.getOrElse(EBCDIC)
+ val isSigned = integralType.signPosition.isDefined
+ val allowedSignOverpunch = isSigned || !strictSignOverpunch
+
+ val isEbcdic = encoding match {
+ case EBCDIC => true
+ case _ => false
+ }
+
+ if (isEbcdic) {
+ bytes: Array[Byte] => {
+ StringDecoders.decodeEbcdicNumber(bytes, !isSigned, allowedSignOverpunch, improvedNullDetection)
+ }
+ } else {
+ bytes: Array[Byte] => {
+ StringDecoders.decodeAsciiNumber(bytes, !isSigned, allowedSignOverpunch, improvedNullDetection)
+ }
+ }
+ }
+
/** Gets a decoder function for a binary encoded integral data type. A direct conversion from array of bytes to the target type is used where possible. */
private def getBinaryEncodedIntegralDecoder(compact: Option[Usage], precision: Int, signPosition: Option[Position] = None, isBigEndian: Boolean, strictIntegralPrecision: Boolean): Decoder = {
val isSigned = signPosition.nonEmpty
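As an illustration of what the new string decoder yields, a small sketch mirroring the ASCII branch above (argument values as computed there for an unsigned field with strict sign overpunching; the exact parameter names live in `StringDecoders`, and the expected output follows the tests further below):

```scala
import za.co.absa.cobrix.cobol.parser.decoders.StringDecoders

// PIC 9(4), ASCII DISPLAY, unsigned: arguments are (bytes, isUnsigned,
// allowedSignOverpunch, improvedNullDetection), as in the branch above.
val decoded = StringDecoders.decodeAsciiNumber("0001".getBytes, true, false, false)
// decoded == "0001": leading zeros are retained instead of producing the number 1
```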
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala
index 8b1a21ee2..285a93087 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala
@@ -49,6 +49,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaReten
* @param generateRecordBytes Generate 'record_bytes' field containing raw bytes of the original record
* @param schemaRetentionPolicy A copybook usually has a root group struct element that acts like a rowtag in XML. This can be retained in Spark schema or can be collapsed
* @param stringTrimmingPolicy Specify if and how strings should be trimmed when parsed
+ * @param isDisplayAlwaysString If true, all fields having DISPLAY format will remain strings and won't be converted to numbers
* @param allowPartialRecords If true, partial ASCII records can be parsed (in cases when LF character is missing for example)
* @param multisegmentParams Parameters for reading multisegment mainframe files
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
@@ -87,6 +88,7 @@ case class CobolParameters(
generateRecordBytes: Boolean,
schemaRetentionPolicy: SchemaRetentionPolicy,
stringTrimmingPolicy: StringTrimmingPolicy,
+ isDisplayAlwaysString: Boolean,
allowPartialRecords: Boolean,
multisegmentParams: Option[MultisegmentParameters],
commentPolicy: CommentPolicy,
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala
index bcd9c11b3..c1daa3a44 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala
@@ -59,6 +59,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaReten
* @param generateRecordBytes Generate 'record_bytes' field containing raw bytes of the original record
* @param schemaPolicy Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook.
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed.
+ * @param isDisplayAlwaysString If true, all fields having DISPLAY format will remain strings and won't be converted to numbers.
* @param allowPartialRecords If true, partial ASCII records can be parsed (in cases when LF character is missing for example)
* @param multisegment Parameters specific to reading multisegment files
* @param commentPolicy A comment truncation policy
@@ -108,6 +109,7 @@ case class ReaderParameters(
generateRecordBytes: Boolean = false,
schemaPolicy: SchemaRetentionPolicy = SchemaRetentionPolicy.CollapseRoot,
stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
+ isDisplayAlwaysString: Boolean = false,
allowPartialRecords: Boolean = false,
multisegment: Option[MultisegmentParameters] = None,
commentPolicy: CommentPolicy = CommentPolicy(),
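Assuming, as the fields shown here suggest, that the remaining constructor parameters of this case class are also defaulted, enabling the behavior programmatically can be sketched as:

```scala
import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters

// All other reader parameters keep the defaults shown above.
val readerParams = ReaderParameters(isDisplayAlwaysString = true)
```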
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala
index f45f67a7a..c3f0f374c 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala
@@ -36,6 +36,7 @@ import scala.collection.immutable.HashMap
*
* @param copybook A parsed copybook.
* @param policy Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook.
+ * @param isDisplayAlwaysString If true, all fields having DISPLAY format will remain strings and won't be converted to numbers.
* @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook.
* @param generateRecordId If true, a record id field will be prepended to the beginning of the schema.
* @param generateRecordBytes If true, a record bytes field will be appended to the beginning of the schema.
@@ -46,6 +47,7 @@ import scala.collection.immutable.HashMap
*/
class CobolSchema(val copybook: Copybook,
val policy: SchemaRetentionPolicy,
+ val isDisplayAlwaysString: Boolean,
val strictIntegralPrecision: Boolean,
val inputFileNameField: String,
val generateRecordId: Boolean,
@@ -93,6 +95,7 @@ object CobolSchema {
segmentRedefines,
fieldParentMap,
readerParameters.stringTrimmingPolicy,
+ readerParameters.isDisplayAlwaysString,
readerParameters.commentPolicy,
readerParameters.strictSignOverpunch,
readerParameters.improvedNullDetection,
@@ -116,6 +119,7 @@ object CobolSchema {
segmentRedefines,
fieldParentMap,
readerParameters.stringTrimmingPolicy,
+ readerParameters.isDisplayAlwaysString,
readerParameters.commentPolicy,
readerParameters.strictSignOverpunch,
readerParameters.improvedNullDetection,
@@ -132,7 +136,17 @@ object CobolSchema {
))
val segIdFieldCount = readerParameters.multisegment.map(p => p.segmentLevelIds.size).getOrElse(0)
val segmentIdPrefix = readerParameters.multisegment.map(p => p.segmentIdPrefix).getOrElse("")
- new CobolSchema(schema, readerParameters.schemaPolicy, readerParameters.strictIntegralPrecision, readerParameters.inputFileNameColumn, readerParameters.generateRecordId, readerParameters.generateRecordBytes, segIdFieldCount, segmentIdPrefix, readerParameters.metadataPolicy)
+ new CobolSchema(schema,
+ readerParameters.schemaPolicy,
+ readerParameters.isDisplayAlwaysString,
+ readerParameters.strictIntegralPrecision,
+ readerParameters.inputFileNameColumn,
+ readerParameters.generateRecordId,
+ readerParameters.generateRecordBytes,
+ segIdFieldCount,
+ segmentIdPrefix,
+ readerParameters.metadataPolicy
+ )
}
def getCodePage(codePageName: String, codePageClass: Option[String]): CodePage = {
diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala
index 8c6cd3a91..cbc25d306 100644
--- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala
+++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala
@@ -33,7 +33,9 @@ class DataSizeSpec extends AnyFunSuite {
private val logger: Logger = LoggerFactory.getLogger(this.getClass)
private def parse(pic: String): Primitive = {
- val visitor = new ParserVisitor(ASCII, StringTrimmingPolicy.TrimNone,
+ val visitor = new ParserVisitor(ASCII,
+ StringTrimmingPolicy.TrimNone,
+ isDisplayAlwaysString = false,
CodePage.getCodePageByName("common"),
StandardCharsets.US_ASCII,
isUtf16BigEndian = true,
diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala
index 020b00ff7..e5004c118 100644
--- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala
+++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala
@@ -33,7 +33,9 @@ class PicValidationSpec extends AnyFunSuite {
private def validatePic(pic: String) = {
- val visitor = new ParserVisitor(ASCII, StringTrimmingPolicy.TrimNone,
+ val visitor = new ParserVisitor(ASCII,
+ StringTrimmingPolicy.TrimNone,
+ isDisplayAlwaysString = false,
CodePage.getCodePageByName("common"),
StandardCharsets.UTF_8,
isUtf16BigEndian = true,
diff --git a/pom.xml b/pom.xml
index a3b3a45c7..e265e02fa 100644
--- a/pom.xml
+++ b/pom.xml
@@ -110,7 +110,7 @@
        <scala.version>2.12.20</scala.version>
        <scala.compat.version>2.12</scala.compat.version>
-        <spark.version>3.4.4</spark.version>
+        <spark.version>3.5.2</spark.version>
        <scalatest.version>3.2.14</scalatest.version>
        2.4.16
        15.0
diff --git a/spark-cobol/pom.xml b/spark-cobol/pom.xml
index 960285ec6..dff3714a3 100644
--- a/spark-cobol/pom.xml
+++ b/spark-cobol/pom.xml
@@ -57,6 +57,12 @@
            <artifactId>cobol-parser_${scala.compat.version}</artifactId>
            <version>${project.version}</version>
        </dependency>
+
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-log4j12</artifactId>
+            <scope>test</scope>
+        </dependency>
diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala
index d1034f2f1..91cb0c4e7 100644
--- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala
+++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala
@@ -72,6 +72,7 @@ object CobolParametersParser extends Logging {
val PARAM_VALUE_FILLERS = "drop_value_fillers"
val PARAM_FILLER_NAMING_POLICY = "filler_naming_policy"
val PARAM_STRICT_INTEGRAL_PRECISION = "strict_integral_precision"
+ val PARAM_DISPLAY_PIC_ALWAYS_STRING = "display_pic_always_string"
val PARAM_GROUP_NOT_TERMINALS = "non_terminals"
val PARAM_OCCURS_MAPPINGS = "occurs_mappings"
@@ -270,6 +271,7 @@ object CobolParametersParser extends Logging {
params.getOrElse(PARAM_GENERATE_RECORD_BYTES, "false").toBoolean,
schemaRetentionPolicy,
stringTrimmingPolicy,
+ params.getOrElse(PARAM_DISPLAY_PIC_ALWAYS_STRING, "false").toBoolean,
params.getOrElse(PARAM_ALLOW_PARTIAL_RECORDS, "false").toBoolean,
parseMultisegmentParameters(params),
parseCommentTruncationPolicy(params),
@@ -409,6 +411,7 @@ object CobolParametersParser extends Logging {
generateRecordBytes = parameters.generateRecordBytes,
schemaPolicy = parameters.schemaRetentionPolicy,
stringTrimmingPolicy = parameters.stringTrimmingPolicy,
+ isDisplayAlwaysString = parameters.isDisplayAlwaysString,
allowPartialRecords = parameters.allowPartialRecords,
parameters.multisegmentParams,
parameters.commentPolicy,
@@ -922,6 +925,10 @@ object CobolParametersParser extends Logging {
throw new IllegalArgumentException(s"'$PARAM_MINIMUM_RECORD_LENGTH' ($min) should be >= '$PARAM_MAXIMUM_RECORD_LENGTH' ($max).")
}
+ if (params.contains(PARAM_DISPLAY_PIC_ALWAYS_STRING) && params(PARAM_DISPLAY_PIC_ALWAYS_STRING).toBoolean &&
+ params.contains(PARAM_STRICT_INTEGRAL_PRECISION) && params(PARAM_STRICT_INTEGRAL_PRECISION).toBoolean)
+ throw new IllegalArgumentException(s"Options '$PARAM_DISPLAY_PIC_ALWAYS_STRING' and '$PARAM_STRICT_INTEGRAL_PRECISION' cannot be used together.")
+
if (unusedKeys.nonEmpty) {
val unusedKeyStr = unusedKeys.mkString(",")
val msg = s"Redundant or unrecognized option(s) to 'spark-cobol': $unusedKeyStr."
diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala
index 2b8a7098e..c67a55071 100644
--- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala
+++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala
@@ -51,6 +51,7 @@ import scala.collection.mutable.ArrayBuffer
*/
class CobolSchema(copybook: Copybook,
schemaRetentionPolicy: SchemaRetentionPolicy,
+ isDisplayAlwaysString: Boolean = false,
strictIntegralPrecision: Boolean = false,
inputFileNameField: String = "",
generateRecordId: Boolean = false,
@@ -58,10 +59,16 @@ class CobolSchema(copybook: Copybook,
generateSegIdFieldsCnt: Int = 0,
segmentIdProvidedPrefix: String = "",
metadataPolicy: MetadataPolicy = MetadataPolicy.Basic)
- extends CobolReaderSchema(
- copybook, schemaRetentionPolicy, strictIntegralPrecision, inputFileNameField, generateRecordId, generateRecordBytes,
- generateSegIdFieldsCnt, segmentIdProvidedPrefix
- ) with Logging with Serializable {
+ extends CobolReaderSchema(copybook,
+ schemaRetentionPolicy,
+ isDisplayAlwaysString,
+ strictIntegralPrecision,
+ inputFileNameField,
+ generateRecordId,
+ generateRecordBytes,
+ generateSegIdFieldsCnt,
+ segmentIdProvidedPrefix
+ ) with Logging with Serializable {
@throws(classOf[IllegalStateException])
private[this] lazy val sparkSchema = createSparkSchema()
@@ -184,6 +191,10 @@ class CobolSchema(copybook: Copybook,
case Some(RAW) => BinaryType
case _ => StringType
}
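+      // DISPLAY numeric fields map to Spark strings when display_pic_always_string is set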
+ case dt: Integral if isDisplayAlwaysString =>
+ if (metadataPolicy != MetadataPolicy.NoMetadata)
+ addIntegralStringMetadata(metadata, dt)
+ StringType
case dt: Integral if strictIntegralPrecision =>
DecimalType(precision = dt.precision, scale = 0)
case dt: Integral =>
@@ -223,6 +234,11 @@ class CobolSchema(copybook: Copybook,
metadataBuilder.putLong(MAX_LENGTH, a.length)
}
+ private def addIntegralStringMetadata(metadataBuilder: MetadataBuilder, i: Integral): MetadataBuilder = {
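+    // A signed DISPLAY number may carry an explicit sign character, hence precision + 1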
+ val maxLength = if (i.signPosition.isDefined) i.precision + 1 else i.precision
+ metadataBuilder.putLong(MAX_LENGTH, maxLength)
+ }
+
private def addExtendedMetadata(metadataBuilder: MetadataBuilder, s: Statement): MetadataBuilder = {
metadataBuilder.putLong("level", s.level)
if (s.originalName.nonEmpty && s.originalName != s.name)
@@ -288,6 +304,7 @@ object CobolSchema {
new CobolSchema(
schema.copybook,
schema.policy,
+ schema.isDisplayAlwaysString,
schema.strictIntegralPrecision,
schema.inputFileNameField,
schema.generateRecordId,
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala
index 79822d204..27d8c44e8 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala
@@ -102,6 +102,6 @@ class CobolSchemaHierarchicalSpec extends AnyWordSpec {
private def parseSchema(copybook: String, segmentRedefines: List[String], fieldParentMap: Map[String, String]): CobolSchema = {
val parsedSchema = CopybookParser.parseTree(copybook, segmentRedefines = segmentRedefines, fieldParentMap = fieldParentMap)
- new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "",false, false)
+ new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "",false, false)
}
}
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala
index 8d3c20340..b25117b4b 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala
@@ -55,7 +55,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBookContents)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", false, false)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -73,7 +73,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBookContents)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, true, "", false, false)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, true, "", false, false)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -94,7 +94,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBookContents)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, false)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", true, false)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -113,7 +113,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBookContents)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, true)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", false, true)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -135,7 +135,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBookContents)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, true)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", true, true)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -163,7 +163,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", true, false)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, false, "", true, false)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -179,7 +179,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", false, false)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, false, "", false, false)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -196,7 +196,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, false)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", true, false)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -211,7 +211,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, true)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", false, true)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -229,7 +229,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, true)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", true, true)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -243,7 +243,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", false, false)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -272,7 +272,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
| | |-- STR_FLD: string (nullable = true)
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", true, false, 2)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, false, "", true, false, 2)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -290,7 +290,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
| | |-- STR_FLD: string (nullable = true)
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", false, true, 2)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, false, "", false, true, 2)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -311,7 +311,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
| | |-- STR_FLD: string (nullable = true)
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", true, true, 2)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, false, "", true, true, 2)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -332,7 +332,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
| | |-- STR_FLD: string (nullable = true)
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, true, "", true, true, 2)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, true, "", true, true, 2)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -349,7 +349,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
| | |-- STR_FLD: string (nullable = true)
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", false, false, 2)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, false, "", false, false, 2)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -367,7 +367,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
| |-- STR_FLD: string (nullable = true)
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, false, 2)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", true, false, 2)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -382,7 +382,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
| |-- STR_FLD: string (nullable = true)
|""".stripMargin.replaceAll("[\\r\\n]", "\n")
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false, 2)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", false, false, 2)
val actualSchema = cobolSchema.getSparkSchema.treeString
assertEqualsMultiline(actualSchema, expectedSchema)
@@ -401,7 +401,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", false, false)
+ val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, false, "", false, false)
val actualSparkSchema = cobolSchema1.getSparkSchema
val rootField = actualSparkSchema.fields.head.dataType.asInstanceOf[StructType]
@@ -430,7 +430,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
val parsedSchema = CopybookParser.parseTree(copyBook)
- val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false)
+ val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", false, false)
val actualSparkSchema = cobolSchema1.getSparkSchema
val metadataStr1 = actualSparkSchema.fields.head.metadata
@@ -447,6 +447,29 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase {
assert(metadataStr2.getLong(MAX_LENGTH) == 7)
}
+ "String display types" in {
+ val copyBook: String =
+ """ 01 RECORD.
+ | 05 STR1 PIC 9(10).
+ | 05 STR2 PIC S9(7).
+ | 05 NUM3 PIC 9V99(7).
+ |""".stripMargin
+
+ val expectedSchema =
+ """root
+ | |-- Seg_Id0: string (nullable = true)
+ | |-- Seg_Id1: string (nullable = true)
+ | |-- STR1: string (nullable = true)
+ | |-- STR2: string (nullable = true)
+ | |-- NUM3: decimal(9,8) (nullable = true)""".stripMargin
+
+ val parsedSchema = CopybookParser.parseTree(copyBook)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, true, false, "", false, false, 2)
+ val actualSchema = cobolSchema.getSparkSchema.treeString
+
+ assertEqualsMultiline(actualSchema, expectedSchema)
+ }
+
"fromSparkOptions" should {
"return a schema for a copybook" in {
val copybook: String =
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/Test01DisplayPicAsStrings.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/Test01DisplayPicAsStrings.scala
new file mode 100644
index 000000000..da247f6a7
--- /dev/null
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/Test01DisplayPicAsStrings.scala
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.spark.cobol.source
+
+import org.scalatest.funsuite.AnyFunSuite
+import org.slf4j.{Logger, LoggerFactory}
+import za.co.absa.cobrix.spark.cobol.parameters.MetadataFields.MAX_LENGTH
+import za.co.absa.cobrix.spark.cobol.source.base.{SimpleComparisonBase, SparkTestBase}
+import za.co.absa.cobrix.spark.cobol.source.fixtures.BinaryFileFixture
+import za.co.absa.cobrix.spark.cobol.utils.SparkUtils
+
+class Test01DisplayPicAsStrings extends AnyFunSuite with SparkTestBase with BinaryFileFixture with SimpleComparisonBase {
+
+ private implicit val logger: Logger = LoggerFactory.getLogger(this.getClass)
+
+ private val copybook =
+ """ 01 R.
+ 03 N1 PIC 9(4).
+ 03 N2 PIC S9(4).
+ 03 D1 PIC 99V99.
+ 03 D2 PIC 99.99.
+ """
+
+ val binFileContents: Array[Byte] = "0001 2{011012.342010K111002200.01 300001J 1 .02".getBytes()
+
+ test("Test a numeric fields having DISPLAY format are parsed as numbers") {
+ withTempBinFile("num_display1", ".dat", binFileContents) { tmpFileName =>
+ val df = spark
+ .read
+ .format("cobol")
+ .option("copybook_contents", copybook)
+ .option("pedantic", "true")
+ .option("encoding", "ascii")
+ .load(tmpFileName)
+
+ val expectedSchema =
+ """root
+ | |-- N1: integer (nullable = true)
+ | |-- N2: integer (nullable = true)
+ | |-- D1: decimal(4,2) (nullable = true)
+ | |-- D2: decimal(4,2) (nullable = true)
+ |""".stripMargin.replace("\r\n", "\n")
+
+ val expectedData =
+ """[ {
+ | "N1" : 1,
+ | "N2" : 20,
+ | "D1" : 1.1,
+ | "D2" : 12.34
+ |}, {
+ | "N1" : 2010,
+ | "N2" : -2111,
+ | "D1" : 0.22,
+ | "D2" : 0.01
+ |}, {
+ | "N1" : 300,
+ | "N2" : -11,
+ | "D1" : 0.01,
+ | "D2" : 0.02
+ |} ]""".stripMargin.replace("\r\n", "\n")
+
+ val actualSchema = df.schema.treeString
+ val actualData = SparkUtils.prettyJSON(df.toJSON.collect().mkString("[", ",", "]"))
+
+ assertEqualsMultiline(actualSchema, expectedSchema)
+ assertEqualsMultiline(actualData, expectedData)
+ }
+ }
+
+ test("Test a numeric fields having DISPLAY format are parsed as strings") {
+ withTempBinFile("num_display2", ".dat", binFileContents) { tmpFileName =>
+ val df = spark
+ .read
+ .format("cobol")
+ .option("copybook_contents", copybook)
+ .option("encoding", "ascii")
+ .option("display_pic_always_string", "true")
+ .option("pedantic", "true")
+ .load(tmpFileName)
+
+ val expectedSchema =
+ """root
+ | |-- N1: string (nullable = true)
+ | |-- N2: string (nullable = true)
+ | |-- D1: decimal(4,2) (nullable = true)
+ | |-- D2: decimal(4,2) (nullable = true)
+ |""".stripMargin.replace("\r\n", "\n")
+
+ val expectedData =
+ """[ {
+ | "N1" : "0001",
+ | "N2" : "+20",
+ | "D1" : 1.1,
+ | "D2" : 12.34
+ |}, {
+ | "N1" : "2010",
+ | "N2" : "-2111",
+ | "D1" : 0.22,
+ | "D2" : 0.01
+ |}, {
+ | "N1" : "300",
+ | "N2" : "-0011",
+ | "D1" : 0.01,
+ | "D2" : 0.02
+ |} ]""".stripMargin.replace("\r\n", "\n")
+
+ val actualSchema = df.schema.treeString
+ val actualData = SparkUtils.prettyJSON(df.toJSON.collect().mkString("[", ",", "]"))
+
+ assert(df.schema.fields.head.metadata.getLong(MAX_LENGTH) == 4)
+ assert(df.schema.fields(1).metadata.getLong(MAX_LENGTH) == 5)
+
+ assertEqualsMultiline(actualSchema, expectedSchema)
+ assertEqualsMultiline(actualData, expectedData)
+ }
+ }
+
+ test("Test incompatible options used together") {
+ withTempBinFile("num_display2", ".dat", binFileContents) { tmpFileName =>
+ val ex = intercept[IllegalArgumentException] {
+ spark
+ .read
+ .format("cobol")
+ .option("copybook_contents", copybook)
+ .option("encoding", "ascii")
+ .option("strict_integral_precision", "true")
+ .option("display_pic_always_string", "true")
+ .option("pedantic", "true")
+ .load(tmpFileName)
+ }
+
+ assert(ex.getMessage == "Options 'display_pic_always_string' and 'strict_integral_precision' cannot be used together.")
+ }
+ }
+}
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala
index a4604bf91..3731efa8a 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala
@@ -37,7 +37,7 @@ class SparkSchemaSpec extends AnyFunSuite {
val parsedSchema = CopybookParser.parseTree(copyBookContents)
- val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "",false, false)
+ val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "",false, false)
val sparkSchema = cobolSchema.getSparkSchema
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala
index 2a9a7cc86..77fb982b1 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala
@@ -24,7 +24,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy
import scala.collection.Seq
-class DummyCobolSchema(val sparkSchema: StructType) extends CobolSchema(new Copybook(Group.root), SchemaRetentionPolicy.KeepOriginal, false, "", false, false) with Serializable {
+class DummyCobolSchema(val sparkSchema: StructType) extends CobolSchema(new Copybook(Group.root), SchemaRetentionPolicy.KeepOriginal, false, false, "", false, false) with Serializable {
override def getSparkSchema: StructType = sparkSchema
override lazy val getRecordSize: Int = 40
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala
index 7c7712ae6..746a41c19 100644
--- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala
@@ -52,7 +52,7 @@ class Test02TextFilesOldSchool extends AnyFunSuite with SparkTestBase with Binar
withTempTextFile("text_ascii", ".txt", StandardCharsets.UTF_8, textFileContent) { tmpFileName =>
val parsedCopybook = CopybookParser.parse(copybook, dataEncoding = ASCII, stringTrimmingPolicy = StringTrimmingPolicy.TrimNone)
- val cobolSchema = new CobolSchema(parsedCopybook, SchemaRetentionPolicy.CollapseRoot, false, "", false)
+ val cobolSchema = new CobolSchema(parsedCopybook, SchemaRetentionPolicy.CollapseRoot, false, false, "", false)
val sparkSchema = cobolSchema.getSparkSchema
val rddText = spark.sparkContext