diff --git a/README.md b/README.md index b77e64530..bf1df3226 100644 --- a/README.md +++ b/README.md @@ -1532,6 +1532,7 @@ The output looks like this: | Option (usage example) | Description | |-----------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | .option("string_trimming_policy", "both") | Specifies if and how string fields should be trimmed. Available options: `both` (default), `none`, `left`, `right`, `keep_all`. `keep_all` - keeps control characters when decoding ASCII text files | +| .option("display_pic_always_string", "false") | If `true`, fields that have `DISPLAY` format will always be converted to `string` type, even if such fields contain numbers, retaining leading and trailing zeros. Cannot be used together with `strict_integral_precision`. | | .option("ebcdic_code_page", "common") | Specifies a code page for EBCDIC encoding. Currently supported values: `common` (default), `common_extended`, `cp037`, `cp037_extended`, and others (see the "Currently supported EBCDIC code pages" section). | | .option("ebcdic_code_page_class", "full.class.specifier") | Specifies a user-provided class for a custom code page to UNICODE conversion. | | .option("field_code_page:cp825", "field1, field2") | Specifies the code page for selected fields. You can add more than one such option for multiple code page overrides. | @@ -1541,7 +1542,7 @@ The output looks like this: | .option("occurs_mapping", "{\"FIELD\": {\"X\": 1}}") | If specified as a JSON string, allows for String `DEPENDING ON` fields with a corresponding mapping. | | .option("strict_sign_overpunching", "true") | If `true` (default), sign overpunching will only be allowed for signed numbers. If `false`, overpunched positive sign will be allowed for unsigned numbers, but negative sign will result in null. | | .option("improved_null_detection", "true") | If `true` (default), values that contain only 0x0 for DISPLAY strings and numbers will be considered `null`s instead of empty strings. | -| .option("strict_integral_precision", "true") | If `true`, Cobrix will not generate `short`/`integer`/`long` Spark data types, and always use `decimal(n)` with the exact precision that matches the copybook. | +| .option("strict_integral_precision", "true") | If `true`, Cobrix will not generate `short`/`integer`/`long` Spark data types, and will always use `decimal(n)` with the exact precision that matches the copybook. Cannot be used together with `display_pic_always_string`. | | .option("binary_as_hex", "false") | By default, fields that have `PIC X` and `USAGE COMP` are converted to the `binary` Spark data type. If this option is set to `true`, such fields will be strings in HEX encoding. | ##### Modifier options diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala index 1e002457f..fb352a1e0 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala @@ -107,24 +107,25 @@ object CopybookParser extends Logging { * Tokenizes the contents of a COBOL copybook and returns the AST. * * @param dataEncoding Encoding of the data file (either ASCII/EBCDIC).
The encoding of the copybook is expected to be ASCII. - * @param copyBookContents A string containing all lines of a copybook - * @param dropGroupFillers Drop groups marked as fillers from the output AST - * @param dropValueFillers Drop primitive fields marked as fillers from the output AST - * @param fillerNamingPolicy Specifies a naming policy for fillers + * @param copyBookContents A string containing all lines of a copybook. + * @param dropGroupFillers Drop groups marked as fillers from the output AST. + * @param dropValueFillers Drop primitive fields marked as fillers from the output AST. + * @param fillerNamingPolicy Specifies a naming policy for fillers. * @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically * resolving segment redefines. - * @param fieldParentMap A segment fields parent mapping - * @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed - * @param strictSignOverpunch If true sign overpunching is not allowed for unsigned numbers + * @param fieldParentMap A segment fields parent mapping. + * @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed. + * @param isDisplayAlwaysString If true, all fields having DISPLAY format will remain strings and won't be converted to numbers. + * @param strictSignOverpunch If true, sign overpunching is not allowed for unsigned numbers. * @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null. - * @param commentPolicy Specifies a policy for comments truncation inside a copybook - * @param ebcdicCodePage A code page for EBCDIC encoded data - * @param asciiCharset A charset for ASCII encoded data + * @param commentPolicy Specifies a policy for comment truncation inside a copybook. + * @param ebcdicCodePage A code page for EBCDIC encoded data. + * @param asciiCharset A charset for ASCII encoded data. * @param isUtf16BigEndian If true, UTF-16 strings are considered big-endian. - * @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754) - * @param nonTerminals A list of non-terminals that should be extracted as strings + * @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754). + * @param nonTerminals A list of non-terminals that should be extracted as strings. * @param debugFieldsPolicy Specifies if debugging fields need to be added and what they should contain (false, hex, raw). - * @return Seq[Group] where a group is a record inside the copybook + * @return Seq[Group] where a group is a record inside the copybook. */ def parse(copyBookContents: String, dataEncoding: Encoding = EBCDIC, @@ -134,6 +135,7 @@ object CopybookParser extends Logging { segmentRedefines: Seq[String] = Nil, fieldParentMap: Map[String, String] = HashMap[String, String](), stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth, + isDisplayAlwaysString: Boolean = false, commentPolicy: CommentPolicy = CommentPolicy(), strictSignOverpunch: Boolean = true, improvedNullDetection: Boolean = false, @@ -155,6 +157,7 @@ object CopybookParser extends Logging { segmentRedefines, fieldParentMap, stringTrimmingPolicy, + isDisplayAlwaysString, commentPolicy, strictSignOverpunch, improvedNullDetection, @@ -180,6 +183,7 @@ object CopybookParser extends Logging { * @param segmentRedefines A list of redefined fields that correspond to various segments.
This needs to be specified for automatically * @param fieldParentMap A segment fields parent mapping * @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed + * @param isDisplayAlwaysString If true, all fields having DISPLAY format will remain strings and won't be converted to numbers * @param commentPolicy Specifies a policy for comments truncation inside a copybook * @param strictSignOverpunch If true sign overpunching is not allowed for unsigned numbers * @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null. @@ -198,6 +202,7 @@ object CopybookParser extends Logging { segmentRedefines: Seq[String] = Nil, fieldParentMap: Map[String, String] = HashMap[String, String](), stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth, + isDisplayAlwaysString: Boolean = false, commentPolicy: CommentPolicy = CommentPolicy(), strictSignOverpunch: Boolean = true, improvedNullDetection: Boolean = false, @@ -219,6 +224,7 @@ object CopybookParser extends Logging { segmentRedefines, fieldParentMap, stringTrimmingPolicy, + isDisplayAlwaysString, commentPolicy, strictSignOverpunch, improvedNullDetection, @@ -265,6 +271,7 @@ object CopybookParser extends Logging { segmentRedefines: Seq[String], fieldParentMap: Map[String, String], stringTrimmingPolicy: StringTrimmingPolicy, + isDisplayAlwaysString: Boolean, commentPolicy: CommentPolicy, strictSignOverpunch: Boolean, improvedNullDetection: Boolean, @@ -279,7 +286,7 @@ object CopybookParser extends Logging { debugFieldsPolicy: DebugFieldsPolicy, fieldCodePageMap: Map[String, String]): Copybook = { - val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, fieldCodePageMap) + val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, isDisplayAlwaysString, commentPolicy, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, fieldCodePageMap) val nonTerms: Set[String] = (for (id <- nonTerminals) yield transformIdentifier(id) diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala index 670f4ec5a..9bafa234f 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala @@ -54,6 +54,7 @@ object ANTLRParser extends Logging { def parse(copyBookContents: String, enc: Encoding, stringTrimmingPolicy: StringTrimmingPolicy, + isDisplayAlwaysString: Boolean, commentPolicy: CommentPolicy, strictSignOverpunch: Boolean, improvedNullDetection: Boolean, @@ -64,7 +65,7 @@ object ANTLRParser extends Logging { isUtf16BigEndian: Boolean, floatingPointFormat: FloatingPointFormat, fieldCodePageMap: Map[String, String]): CopybookAST = { - val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, fieldCodePageMap) + val visitor = new ParserVisitor(enc, stringTrimmingPolicy, isDisplayAlwaysString, ebcdicCodePage, asciiCharset, 
isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, fieldCodePageMap) val strippedContents = filterSpecialCharacters(copyBookContents).split("\\r?\\n").map( line => diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala index 20908ad08..5e5ed87ff 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala @@ -41,6 +41,7 @@ sealed trait Expr class ParserVisitor(enc: Encoding, stringTrimmingPolicy: StringTrimmingPolicy, + isDisplayAlwaysString: Boolean, ebcdicCodePage: CodePage, asciiCharset: Charset, isUtf16BigEndian: Boolean, @@ -854,7 +855,7 @@ class ParserVisitor(enc: Encoding, Map(), isDependee = false, identifier.toUpperCase() == Constants.FILLER, - DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision) + DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, isDisplayAlwaysString, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian = isUtf16BigEndian, floatingPointFormat = floatingPointFormat, strictSignOverpunch = strictSignOverpunch, improvedNullDetection = improvedNullDetection, strictIntegralPrecision = strictIntegralPrecision) ) (Some(parent)) parent.children.append(prim) diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/asttransform/NonTerminalsAdder.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/asttransform/NonTerminalsAdder.scala index b7d91d2f3..bb652df28 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/asttransform/NonTerminalsAdder.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/asttransform/NonTerminalsAdder.scala @@ -73,7 +73,7 @@ class NonTerminalsAdder( ) val sz = g.binaryProperties.actualSize val dataType = AlphaNumeric(s"X($sz)", sz, enc = Some(enc)) - val decode = DecoderSelector.getDecoder(dataType, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection) + val decode = DecoderSelector.getDecoder(dataType, stringTrimmingPolicy, isDisplayAlwaysString = false, ebcdicCodePage = ebcdicCodePage, asciiCharset = asciiCharset, isUtf16BigEndian = isUtf16BigEndian, floatingPointFormat = floatingPointFormat, strictSignOverpunch = strictSignOverpunch, improvedNullDetection = improvedNullDetection) val newName = getNonTerminalName(g.name, g.parent.get) newChildren.append( Primitive( diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala index 462fbf03f..791d6ef9f 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala @@ -56,6 +56,7 @@ object DecoderSelector { */ def getDecoder(dataType: CobolType, stringTrimmingPolicy: StringTrimmingPolicy = TrimBoth, + isDisplayAlwaysString: Boolean = false, ebcdicCodePage: CodePage = new CodePageCommon, asciiCharset: Charset = StandardCharsets.US_ASCII, isUtf16BigEndian: Boolean = true, @@ -66,6 +67,7 @@ val decoder = dataType match { case alphaNumeric: AlphaNumeric =>
getStringDecoder(alphaNumeric.enc.getOrElse(EBCDIC), stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, improvedNullDetection) case decimalType: Decimal => getDecimalDecoder(decimalType, floatingPointFormat, strictSignOverpunch, improvedNullDetection) + case integralType: Integral if isDisplayAlwaysString => getDisplayDecoderAsString(integralType, improvedNullDetection, strictSignOverpunch) case integralType: Integral => getIntegralDecoder(integralType, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision) case _ => throw new IllegalStateException("Unknown AST object") } @@ -251,6 +253,29 @@ object DecoderSelector { } } + private[parser] def getDisplayDecoderAsString(integralType: Integral, + improvedNullDetection: Boolean, + strictSignOverpunch: Boolean): Decoder = { + val encoding = integralType.enc.getOrElse(EBCDIC) + val isSigned = integralType.signPosition.isDefined + val allowedSignOverpunch = isSigned || !strictSignOverpunch + + val isEbcdic = encoding match { + case EBCDIC => true + case _ => false + } + + if (isEbcdic) { + bytes: Array[Byte] => { + StringDecoders.decodeEbcdicNumber(bytes, !isSigned, allowedSignOverpunch, improvedNullDetection) + } + } else { + bytes: Array[Byte] => { + StringDecoders.decodeAsciiNumber(bytes, !isSigned, allowedSignOverpunch, improvedNullDetection) + } + } + } + /** Gets a decoder function for a binary encoded integral data type. A direct conversion from array of bytes to the target type is used where possible. */ private def getBinaryEncodedIntegralDecoder(compact: Option[Usage], precision: Int, signPosition: Option[Position] = None, isBigEndian: Boolean, strictIntegralPrecision: Boolean): Decoder = { val isSigned = signPosition.nonEmpty diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala index 8b1a21ee2..285a93087 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala @@ -49,6 +49,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaReten * @param generateRecordBytes Generate 'record_bytes' field containing raw bytes of the original record * @param schemaRetentionPolicy A copybook usually has a root group struct element that acts like a rowtag in XML. This can be retained in Spark schema or can be collapsed * @param stringTrimmingPolicy Specify if and how strings should be trimmed when parsed + * @param isDisplayAlwaysString If true, all fields having DISPLAY format will remain strings and won't be converted to numbers * @param allowPartialRecords If true, partial ASCII records can be parsed (in cases when LF character is missing for example) * @param multisegmentParams Parameters for reading multisegment mainframe files * @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null. 
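Reviewer note: a minimal sketch of the dispatch rule the new `DecoderSelector` branch above introduces — DISPLAY numerics keep their textual form when `isDisplayAlwaysString` is set, and fall through to numeric decoding otherwise. The ADT and decoders below are illustrative stand-ins (toy types, default-charset decoding, no sign-overpunch handling), not the library's actual `AlphaNumeric`/`Integral` AST classes or `StringDecoders` logic:

```scala
// Toy model of the decoder dispatch (illustrative only).
sealed trait FieldType
case class AlphaNum(length: Int) extends FieldType
case class DisplayNum(precision: Int, signed: Boolean) extends FieldType

def pickDecoder(t: FieldType, isDisplayAlwaysString: Boolean): Array[Byte] => Any = t match {
  case AlphaNum(_)                               => bytes => new String(bytes)             // always a string
  case DisplayNum(_, _) if isDisplayAlwaysString => bytes => new String(bytes).trim        // "0001" stays "0001"
  case DisplayNum(_, _)                          => bytes => new String(bytes).trim.toLong // "0001" becomes 1L
}
```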
@@ -87,6 +88,7 @@ case class CobolParameters( generateRecordBytes: Boolean, schemaRetentionPolicy: SchemaRetentionPolicy, stringTrimmingPolicy: StringTrimmingPolicy, + isDisplayAlwaysString: Boolean, allowPartialRecords: Boolean, multisegmentParams: Option[MultisegmentParameters], commentPolicy: CommentPolicy, diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala index bcd9c11b3..c1daa3a44 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala @@ -59,6 +59,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaReten * @param generateRecordBytes Generate 'record_bytes' field containing raw bytes of the original record * @param schemaPolicy Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook. * @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed. + * @param isDisplayAlwaysString If true, all fields having DISPLAY format will remain strings and won't be converted to numbers. * @param allowPartialRecords If true, partial ASCII records can be parsed (in cases when LF character is missing for example) * @param multisegment Parameters specific to reading multisegment files * @param commentPolicy A comment truncation policy @@ -108,6 +109,7 @@ case class ReaderParameters( generateRecordBytes: Boolean = false, schemaPolicy: SchemaRetentionPolicy = SchemaRetentionPolicy.CollapseRoot, stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth, + isDisplayAlwaysString: Boolean = false, allowPartialRecords: Boolean = false, multisegment: Option[MultisegmentParameters] = None, commentPolicy: CommentPolicy = CommentPolicy(), diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala index f45f67a7a..c3f0f374c 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/schema/CobolSchema.scala @@ -36,6 +36,7 @@ import scala.collection.immutable.HashMap * * @param copybook A parsed copybook. * @param policy Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook. + * @param isDisplayAlwaysString If true, all fields having DISPLAY format will remain strings and won't be converted to numbers * @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook. * @param generateRecordId If true, a record id field will be prepended to the beginning of the schema. * @param generateRecordBytes If true, a record bytes field will be added to the beginning of the schema.
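Since every member of `ReaderParameters` carries a default value, callers of the parser-level API can opt in to the new behavior with a single named argument. A minimal sketch (all other members keep the defaults declared in the case class above):

```scala
import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters

// DISPLAY numerics will be kept as strings; everything else stays at its default.
val readerParams = ReaderParameters(isDisplayAlwaysString = true)
```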
@@ -46,6 +47,7 @@ import scala.collection.immutable.HashMap */ class CobolSchema(val copybook: Copybook, val policy: SchemaRetentionPolicy, + val isDisplayAlwaysString: Boolean, val strictIntegralPrecision: Boolean, val inputFileNameField: String, val generateRecordId: Boolean, @@ -93,6 +95,7 @@ object CobolSchema { segmentRedefines, fieldParentMap, readerParameters.stringTrimmingPolicy, + readerParameters.isDisplayAlwaysString, readerParameters.commentPolicy, readerParameters.strictSignOverpunch, readerParameters.improvedNullDetection, @@ -116,6 +119,7 @@ object CobolSchema { segmentRedefines, fieldParentMap, readerParameters.stringTrimmingPolicy, + readerParameters.isDisplayAlwaysString, readerParameters.commentPolicy, readerParameters.strictSignOverpunch, readerParameters.improvedNullDetection, @@ -132,7 +136,17 @@ object CobolSchema { )) val segIdFieldCount = readerParameters.multisegment.map(p => p.segmentLevelIds.size).getOrElse(0) val segmentIdPrefix = readerParameters.multisegment.map(p => p.segmentIdPrefix).getOrElse("") - new CobolSchema(schema, readerParameters.schemaPolicy, readerParameters.strictIntegralPrecision, readerParameters.inputFileNameColumn, readerParameters.generateRecordId, readerParameters.generateRecordBytes, segIdFieldCount, segmentIdPrefix, readerParameters.metadataPolicy) + new CobolSchema(schema, + readerParameters.schemaPolicy, + readerParameters.isDisplayAlwaysString, + readerParameters.strictIntegralPrecision, + readerParameters.inputFileNameColumn, + readerParameters.generateRecordId, + readerParameters.generateRecordBytes, + segIdFieldCount, + segmentIdPrefix, + readerParameters.metadataPolicy + ) } def getCodePage(codePageName: String, codePageClass: Option[String]): CodePage = { diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala index 8c6cd3a91..cbc25d306 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/DataSizeSpec.scala @@ -33,7 +33,9 @@ class DataSizeSpec extends AnyFunSuite { private val logger: Logger = LoggerFactory.getLogger(this.getClass) private def parse(pic: String): Primitive = { - val visitor = new ParserVisitor(ASCII, StringTrimmingPolicy.TrimNone, + val visitor = new ParserVisitor(ASCII, + StringTrimmingPolicy.TrimNone, + isDisplayAlwaysString = false, CodePage.getCodePageByName("common"), StandardCharsets.US_ASCII, isUtf16BigEndian = true, diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala index 020b00ff7..e5004c118 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/parse/PicValidationSpec.scala @@ -33,7 +33,9 @@ class PicValidationSpec extends AnyFunSuite { private def validatePic(pic: String) = { - val visitor = new ParserVisitor(ASCII, StringTrimmingPolicy.TrimNone, + val visitor = new ParserVisitor(ASCII, + StringTrimmingPolicy.TrimNone, + isDisplayAlwaysString = false, CodePage.getCodePageByName("common"), StandardCharsets.UTF_8, isUtf16BigEndian = true, diff --git a/pom.xml b/pom.xml index a3b3a45c7..e265e02fa 100644 --- a/pom.xml +++ b/pom.xml @@ -110,7 +110,7 @@ <scala.version>2.12.20</scala.version> <scala.compat.version>2.12</scala.compat.version> - <spark.version>3.4.4</spark.version> + <spark.version>3.5.2</spark.version> 3.2.14 2.4.16 15.0 diff --git
a/spark-cobol/pom.xml b/spark-cobol/pom.xml index 960285ec6..dff3714a3 100644 --- a/spark-cobol/pom.xml +++ b/spark-cobol/pom.xml @@ -57,6 +57,12 @@ <artifactId>cobol-parser_${scala.compat.version}</artifactId> <version>${project.version}</version> </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + <scope>test</scope> + </dependency> + diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala index d1034f2f1..91cb0c4e7 100644 --- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala +++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala @@ -72,6 +72,7 @@ object CobolParametersParser extends Logging { val PARAM_VALUE_FILLERS = "drop_value_fillers" val PARAM_FILLER_NAMING_POLICY = "filler_naming_policy" val PARAM_STRICT_INTEGRAL_PRECISION = "strict_integral_precision" + val PARAM_DISPLAY_PIC_ALWAYS_STRING = "display_pic_always_string" val PARAM_GROUP_NOT_TERMINALS = "non_terminals" val PARAM_OCCURS_MAPPINGS = "occurs_mappings" @@ -270,6 +271,7 @@ object CobolParametersParser extends Logging { params.getOrElse(PARAM_GENERATE_RECORD_BYTES, "false").toBoolean, schemaRetentionPolicy, stringTrimmingPolicy, + params.getOrElse(PARAM_DISPLAY_PIC_ALWAYS_STRING, "false").toBoolean, params.getOrElse(PARAM_ALLOW_PARTIAL_RECORDS, "false").toBoolean, parseMultisegmentParameters(params), parseCommentTruncationPolicy(params), @@ -409,6 +411,7 @@ object CobolParametersParser extends Logging { generateRecordBytes = parameters.generateRecordBytes, schemaPolicy = parameters.schemaRetentionPolicy, stringTrimmingPolicy = parameters.stringTrimmingPolicy, + isDisplayAlwaysString = parameters.isDisplayAlwaysString, allowPartialRecords = parameters.allowPartialRecords, parameters.multisegmentParams, parameters.commentPolicy, @@ -922,6 +925,10 @@ object CobolParametersParser extends Logging { throw new IllegalArgumentException(s"'$PARAM_MINIMUM_RECORD_LENGTH' ($min) should be >= '$PARAM_MAXIMUM_RECORD_LENGTH' ($max).") } + if (params.contains(PARAM_DISPLAY_PIC_ALWAYS_STRING) && params(PARAM_DISPLAY_PIC_ALWAYS_STRING).toBoolean && + params.contains(PARAM_STRICT_INTEGRAL_PRECISION) && params(PARAM_STRICT_INTEGRAL_PRECISION).toBoolean) + throw new IllegalArgumentException(s"Options '$PARAM_DISPLAY_PIC_ALWAYS_STRING' and '$PARAM_STRICT_INTEGRAL_PRECISION' cannot be used together.") + if (unusedKeys.nonEmpty) { val unusedKeyStr = unusedKeys.mkString(",") val msg = s"Redundant or unrecognized option(s) to 'spark-cobol': $unusedKeyStr."
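To see the option and its validation end to end, here is a usage sketch mirroring the new test further down; the `copybook` value and the input path are placeholders, not part of this change:

```scala
// Hypothetical reader invocation exercising the new option.
val df = spark.read
  .format("cobol")
  .option("copybook_contents", copybook)       // placeholder copybook containing PIC 9(n) fields
  .option("encoding", "ascii")
  .option("display_pic_always_string", "true") // PIC 9(4) value "0001" is kept as the string "0001"
  .load("/path/to/data.dat")                   // placeholder path

// Adding .option("strict_integral_precision", "true") to the same read now fails fast:
// IllegalArgumentException: Options 'display_pic_always_string' and 'strict_integral_precision'
// cannot be used together.
```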
diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala index 2b8a7098e..c67a55071 100644 --- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala +++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala @@ -51,6 +51,7 @@ import scala.collection.mutable.ArrayBuffer */ class CobolSchema(copybook: Copybook, schemaRetentionPolicy: SchemaRetentionPolicy, + isDisplayAlwaysString: Boolean = false, strictIntegralPrecision: Boolean = false, inputFileNameField: String = "", generateRecordId: Boolean = false, @@ -58,10 +59,16 @@ class CobolSchema(copybook: Copybook, generateSegIdFieldsCnt: Int = 0, segmentIdProvidedPrefix: String = "", metadataPolicy: MetadataPolicy = MetadataPolicy.Basic) - extends CobolReaderSchema( - copybook, schemaRetentionPolicy, strictIntegralPrecision, inputFileNameField, generateRecordId, generateRecordBytes, - generateSegIdFieldsCnt, segmentIdProvidedPrefix - ) with Logging with Serializable { + extends CobolReaderSchema(copybook, + schemaRetentionPolicy, + isDisplayAlwaysString, + strictIntegralPrecision, + inputFileNameField, + generateRecordId, + generateRecordBytes, + generateSegIdFieldsCnt, + segmentIdProvidedPrefix + ) with Logging with Serializable { @throws(classOf[IllegalStateException]) private[this] lazy val sparkSchema = createSparkSchema() @@ -184,6 +191,10 @@ class CobolSchema(copybook: Copybook, case Some(RAW) => BinaryType case _ => StringType } + case dt: Integral if isDisplayAlwaysString => + if (metadataPolicy != MetadataPolicy.NoMetadata) + addIntegralStringMetadata(metadata, dt) + StringType case dt: Integral if strictIntegralPrecision => DecimalType(precision = dt.precision, scale = 0) case dt: Integral => @@ -223,6 +234,11 @@ class CobolSchema(copybook: Copybook, metadataBuilder.putLong(MAX_LENGTH, a.length) } + private def addIntegralStringMetadata(metadataBuilder: MetadataBuilder, i: Integral): MetadataBuilder = { + val maxLength = if (i.signPosition.isDefined) i.precision + 1 else i.precision + metadataBuilder.putLong(MAX_LENGTH, maxLength) + } + private def addExtendedMetadata(metadataBuilder: MetadataBuilder, s: Statement): MetadataBuilder = { metadataBuilder.putLong("level", s.level) if (s.originalName.nonEmpty && s.originalName != s.name) @@ -288,6 +304,7 @@ object CobolSchema { new CobolSchema( schema.copybook, schema.policy, + schema.isDisplayAlwaysString, schema.strictIntegralPrecision, schema.inputFileNameField, schema.generateRecordId, diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala index 79822d204..27d8c44e8 100644 --- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala +++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaHierarchicalSpec.scala @@ -102,6 +102,6 @@ class CobolSchemaHierarchicalSpec extends AnyWordSpec { private def parseSchema(copybook: String, segmentRedefines: List[String], fieldParentMap: Map[String, String]): CobolSchema = { val parsedSchema = CopybookParser.parseTree(copybook, segmentRedefines = segmentRedefines, fieldParentMap = fieldParentMap) - new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "",false, false) + new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "",false, 
false) } } diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala index 8d3c20340..b25117b4b 100644 --- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala +++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/CobolSchemaSpec.scala @@ -55,7 +55,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBookContents) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", false, false) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -73,7 +73,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBookContents) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, true, "", false, false) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, true, "", false, false) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -94,7 +94,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBookContents) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, false) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", true, false) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -113,7 +113,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBookContents) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, true) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", false, true) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -135,7 +135,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBookContents) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, true) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", true, true) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -163,7 +163,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", true, false) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, false, "", true, false) val actualSchema = cobolSchema.getSparkSchema.treeString 
assertEqualsMultiline(actualSchema, expectedSchema) @@ -179,7 +179,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", false, false) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, false, "", false, false) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -196,7 +196,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, false) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", true, false) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -211,7 +211,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, true) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", false, true) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -229,7 +229,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, true) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", true, true) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -243,7 +243,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", false, false) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -272,7 +272,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { | | |-- STR_FLD: string (nullable = true) |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", true, false, 2) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, false, "", true, false, 2) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -290,7 +290,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { | | |-- STR_FLD: string (nullable = true) |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, 
SchemaRetentionPolicy.KeepOriginal, false, "", false, true, 2) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, false, "", false, true, 2) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -311,7 +311,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { | | |-- STR_FLD: string (nullable = true) |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", true, true, 2) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, false, "", true, true, 2) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -332,7 +332,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { | | |-- STR_FLD: string (nullable = true) |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, true, "", true, true, 2) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, true, "", true, true, 2) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -349,7 +349,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { | | |-- STR_FLD: string (nullable = true) |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", false, false, 2) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, false, "", false, false, 2) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -367,7 +367,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { | |-- STR_FLD: string (nullable = true) |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", true, false, 2) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", true, false, 2) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -382,7 +382,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { | |-- STR_FLD: string (nullable = true) |""".stripMargin.replaceAll("[\\r\\n]", "\n") val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false, 2) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", false, false, 2) val actualSchema = cobolSchema.getSparkSchema.treeString assertEqualsMultiline(actualSchema, expectedSchema) @@ -401,7 +401,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, "", false, false) + val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.KeepOriginal, false, false, "", false, false) val actualSparkSchema = 
cobolSchema1.getSparkSchema val rootField = actualSparkSchema.fields.head.dataType.asInstanceOf[StructType] @@ -430,7 +430,7 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { val parsedSchema = CopybookParser.parseTree(copyBook) - val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "", false, false) + val cobolSchema1 = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "", false, false) val actualSparkSchema = cobolSchema1.getSparkSchema val metadataStr1 = actualSparkSchema.fields.head.metadata @@ -447,6 +447,29 @@ class CobolSchemaSpec extends AnyWordSpec with SimpleComparisonBase { assert(metadataStr2.getLong(MAX_LENGTH) == 7) } + "String display types" in { + val copyBook: String = + """ 01 RECORD. + | 05 STR1 PIC 9(10). + | 05 STR2 PIC S9(7). + | 05 NUM3 PIC 9V99(7). + |""".stripMargin + + val expectedSchema = + """root + | |-- Seg_Id0: string (nullable = true) + | |-- Seg_Id1: string (nullable = true) + | |-- STR1: string (nullable = true) + | |-- STR2: string (nullable = true) + | |-- NUM3: decimal(9,8) (nullable = true)""".stripMargin + + val parsedSchema = CopybookParser.parseTree(copyBook) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, true, false, "", false, false, 2) + val actualSchema = cobolSchema.getSparkSchema.treeString + + assertEqualsMultiline(actualSchema, expectedSchema) + } + "fromSparkOptions" should { "return a schema for a copybook" in { val copybook: String = diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/Test01DisplayPicAsStrings.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/Test01DisplayPicAsStrings.scala new file mode 100644 index 000000000..da247f6a7 --- /dev/null +++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/Test01DisplayPicAsStrings.scala @@ -0,0 +1,149 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.cobrix.spark.cobol.source + +import org.scalatest.funsuite.AnyFunSuite +import org.slf4j.{Logger, LoggerFactory} +import za.co.absa.cobrix.spark.cobol.parameters.MetadataFields.MAX_LENGTH +import za.co.absa.cobrix.spark.cobol.source.base.{SimpleComparisonBase, SparkTestBase} +import za.co.absa.cobrix.spark.cobol.source.fixtures.BinaryFileFixture +import za.co.absa.cobrix.spark.cobol.utils.SparkUtils + +class Test01DisplayPicAsStrings extends AnyFunSuite with SparkTestBase with BinaryFileFixture with SimpleComparisonBase { + + private implicit val logger: Logger = LoggerFactory.getLogger(this.getClass) + + private val copybook = + """ 01 R. + 03 N1 PIC 9(4). + 03 N2 PIC S9(4). + 03 D1 PIC 99V99. + 03 D2 PIC 99.99. 
+      """ + + val binFileContents: Array[Byte] = "0001  2{011012.342010K111002200.01 300001J 1  .02".getBytes() + + test("Test that numeric fields having DISPLAY format are parsed as numbers") { + withTempBinFile("num_display1", ".dat", binFileContents) { tmpFileName => + val df = spark + .read + .format("cobol") + .option("copybook_contents", copybook) + .option("pedantic", "true") + .option("encoding", "ascii") + .load(tmpFileName) + + val expectedSchema = + """root + | |-- N1: integer (nullable = true) + | |-- N2: integer (nullable = true) + | |-- D1: decimal(4,2) (nullable = true) + | |-- D2: decimal(4,2) (nullable = true) + |""".stripMargin.replace("\r\n", "\n") + + val expectedData = + """[ { + | "N1" : 1, + | "N2" : 20, + | "D1" : 1.1, + | "D2" : 12.34 + |}, { + | "N1" : 2010, + | "N2" : -2111, + | "D1" : 0.22, + | "D2" : 0.01 + |}, { + | "N1" : 300, + | "N2" : -11, + | "D1" : 0.01, + | "D2" : 0.02 + |} ]""".stripMargin.replace("\r\n", "\n") + + val actualSchema = df.schema.treeString + val actualData = SparkUtils.prettyJSON(df.toJSON.collect().mkString("[", ",", "]")) + + assertEqualsMultiline(actualSchema, expectedSchema) + assertEqualsMultiline(actualData, expectedData) + } + } + + test("Test that numeric fields having DISPLAY format are parsed as strings") { + withTempBinFile("num_display2", ".dat", binFileContents) { tmpFileName => + val df = spark + .read + .format("cobol") + .option("copybook_contents", copybook) + .option("encoding", "ascii") + .option("display_pic_always_string", "true") + .option("pedantic", "true") + .load(tmpFileName) + + val expectedSchema = + """root + | |-- N1: string (nullable = true) + | |-- N2: string (nullable = true) + | |-- D1: decimal(4,2) (nullable = true) + | |-- D2: decimal(4,2) (nullable = true) + |""".stripMargin.replace("\r\n", "\n") + + val expectedData = + """[ { + | "N1" : "0001", + | "N2" : "+20", + | "D1" : 1.1, + | "D2" : 12.34 + |}, { + | "N1" : "2010", + | "N2" : "-2111", + | "D1" : 0.22, + | "D2" : 0.01 + |}, { + | "N1" : "300", + | "N2" : "-0011", + | "D1" : 0.01, + | "D2" : 0.02 + |} ]""".stripMargin.replace("\r\n", "\n") + + val actualSchema = df.schema.treeString + val actualData = SparkUtils.prettyJSON(df.toJSON.collect().mkString("[", ",", "]")) + + assert(df.schema.fields.head.metadata.getLong(MAX_LENGTH) == 4) + assert(df.schema.fields(1).metadata.getLong(MAX_LENGTH) == 5) + + assertEqualsMultiline(actualSchema, expectedSchema) + assertEqualsMultiline(actualData, expectedData) + } + } + + test("Test incompatible options used together") { + withTempBinFile("num_display2", ".dat", binFileContents) { tmpFileName => + val ex = intercept[IllegalArgumentException] { + spark + .read + .format("cobol") + .option("copybook_contents", copybook) + .option("encoding", "ascii") + .option("strict_integral_precision", "true") + .option("display_pic_always_string", "true") + .option("pedantic", "true") + .load(tmpFileName) + } + + assert(ex.getMessage == "Options 'display_pic_always_string' and 'strict_integral_precision' cannot be used together.") + } + } +} diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala index a4604bf91..3731efa8a 100644 --- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala +++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/SparkSchemaSpec.scala @@ -37,7 +37,7 @@ class SparkSchemaSpec extends AnyFunSuite { val 
parsedSchema = CopybookParser.parseTree(copyBookContents) - val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, "",false, false) + val cobolSchema = new CobolSchema(parsedSchema, SchemaRetentionPolicy.CollapseRoot, false, false, "",false, false) val sparkSchema = cobolSchema.getSparkSchema diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala index 2a9a7cc86..77fb982b1 100644 --- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala +++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/base/impl/DummyCobolSchema.scala @@ -24,7 +24,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy import scala.collection.Seq -class DummyCobolSchema(val sparkSchema: StructType) extends CobolSchema(new Copybook(Group.root), SchemaRetentionPolicy.KeepOriginal, false, "", false, false) with Serializable { +class DummyCobolSchema(val sparkSchema: StructType) extends CobolSchema(new Copybook(Group.root), SchemaRetentionPolicy.KeepOriginal, false, false, "", false, false) with Serializable { override def getSparkSchema: StructType = sparkSchema override lazy val getRecordSize: Int = 40 diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala index 7c7712ae6..746a41c19 100644 --- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala +++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/text/Test02TextFilesOldSchool.scala @@ -52,7 +52,7 @@ class Test02TextFilesOldSchool extends AnyFunSuite with SparkTestBase with Binar withTempTextFile("text_ascii", ".txt", StandardCharsets.UTF_8, textFileContent) { tmpFileName => val parsedCopybook = CopybookParser.parse(copybook, dataEncoding = ASCII, stringTrimmingPolicy = StringTrimmingPolicy.TrimNone) - val cobolSchema = new CobolSchema(parsedCopybook, SchemaRetentionPolicy.CollapseRoot, false, "", false) + val cobolSchema = new CobolSchema(parsedCopybook, SchemaRetentionPolicy.CollapseRoot, false, false, "", false) val sparkSchema = cobolSchema.getSparkSchema val rddText = spark.sparkContext