Skip to content

Commit 47ebf5f

Browse files
committed
#792 Implement lazily evaluated encoders for all single-character code pages.
1 parent ad6729a commit 47ebf5f

File tree

4 files changed

+45
-14
lines changed

4 files changed

+45
-14
lines changed

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1144.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ package za.co.absa.cobrix.cobol.parser.encoding.codepage
2222
* It corresponds to code page 280 and only differs from it in position 9F, where the euro sign € is located instead
2323
* of the international currency symbol ¤.
2424
*/
25-
class CodePage1144 extends SingleByteCodePage(CodePage1144.ebcdicToAsciiMapping, Some(CodePage1144.asciiToEbcdicMapping)) {
25+
class CodePage1144 extends SingleByteCodePage(CodePage1144.ebcdicToAsciiMapping) {
2626
override def codePageShortName: String = "cp1144"
2727
}
2828

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageCommon.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ package za.co.absa.cobrix.cobol.parser.encoding.codepage
2121
*
2222
* It is an "invariant" subset of EBCDIC. Each converted symbol should be present in all EBCDIC pages.
2323
*/
24-
class CodePageCommon extends SingleByteCodePage(CodePageCommon.ebcdicToAsciiMapping, Some(CodePageCommon.asciiToEbcdicMapping)) {
24+
class CodePageCommon extends SingleByteCodePage(CodePageCommon.ebcdicToAsciiMapping) {
2525
override def codePageShortName: String = "common"
2626
}
2727

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/SingleByteCodePage.scala

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import java.util
2121
/**
2222
* The base class for all single-byte EBCDIC code pages. It decodes EBCDIC bytes to strings and encodes strings back to EBCDIC bytes.
2323
*/
24-
abstract class SingleByteCodePage(ebcdicToAsciiMapping: Array[Char], asciiToEbcdicMapping: Option[Int => Byte]=None)
24+
abstract class SingleByteCodePage(ebcdicToAsciiMapping: Array[Char])
2525
extends CodePage {
2626
private val ConversionTableElements = 256
2727
private val conversionTable = ebcdicToAsciiMapping
@@ -34,7 +34,7 @@ abstract class SingleByteCodePage(ebcdicToAsciiMapping: Array[Char], asciiToEbcd
3434
/**
3535
* Decodes bytes encoded as single byte EBCDIC code page to string.
3636
*/
37-
final def convert(bytes: Array[Byte]): String = {
37+
final override def convert(bytes: Array[Byte]): String = {
3838
var i = 0
3939
val buf = new StringBuffer(bytes.length)
4040
while (i < bytes.length) {
@@ -51,10 +51,8 @@ abstract class SingleByteCodePage(ebcdicToAsciiMapping: Array[Char], asciiToEbcd
5151
* @param length The length of the output (in bytes)
5252
* @return The encoded bytes: an array of exactly `length` bytes, with positions not covered by the input string left as 0x40
5353
*/
54-
def convert(string: String, length: Int): Array[Byte] = {
54+
final override def convert(string: String, length: Int): Array[Byte] = {
5555
require(length >= 0, s"Field length cannot be negative, got $length")
56-
require(asciiToEbcdicMapping.isDefined, s"Cannot encode strings for Code Page without ASCII to EBCDIC " +
57-
s"mapping ${this.getClass.getSimpleName}")
5856

5957
var i = 0
6058
val buf = new Array[Byte](length)
@@ -63,12 +61,22 @@ abstract class SingleByteCodePage(ebcdicToAsciiMapping: Array[Char], asciiToEbcd
6361
util.Arrays.fill(buf, 0x40.toByte)
6462

6563
while (i < string.length && i < length) {
66-
val unicodeCodePoint: Int = string.codePointAt(i)
67-
buf(i) = asciiToEbcdicMapping.get(unicodeCodePoint)
64+
buf(i) = reverseEbcdicToAsciiMapping.getOrElse(string.charAt(i), 0x40.toByte)
6865
i = i + 1
6966
}
7067
buf
7168
}
7269

73-
override def supportsEncoding: Boolean = asciiToEbcdicMapping.isDefined
70+
override def supportsEncoding: Boolean = true
71+
72+
lazy val reverseEbcdicToAsciiMapping: Map[Char, Byte] = {
73+
val map = scala.collection.mutable.Map[Char, Byte]()
74+
for (i <- ebcdicToAsciiMapping.indices) {
75+
val asciiChar = ebcdicToAsciiMapping(i)
76+
if (!map.contains(asciiChar)) {
77+
map(asciiChar) = i.toByte
78+
}
79+
}
80+
map.toMap
81+
}
7482
}

cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -251,9 +251,29 @@ class StringDecodersSpec extends AnyWordSpec {
251251
assert(actual == expected)
252252
}
253253

254+
"decode a CP1025 string special characters" in {
255+
val expectedUnicode = "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяABCDEFGHIJKLMNOPQRSTUVWXYZ123456789[]\r\n"
256+
val ebcdicBytes = Array(
257+
0xB9, 0xBA, 0xED, 0xBF, 0xBC, 0xBD, 0xEC, 0xFA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xDA, 0xDB, 0xDC,
258+
0xDE, 0xDF, 0xEA, 0xEB, 0xBE, 0xCA, 0xBB, 0xFE, 0xFB, 0xFD, 0x57, 0xEF, 0xEE, 0xFC, 0xB8, 0xDD,
259+
0x77, 0x78, 0xAF, 0x8D, 0x8A, 0x8B, 0xAE, 0xB2, 0x8F, 0x90, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
260+
0xAA, 0xAB, 0xAC, 0xAD, 0x8C, 0x8E, 0x80, 0xB6, 0xB3, 0xB5, 0xB7, 0xB1, 0xB0, 0xB4, 0x76, 0xA0,
261+
0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
262+
0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6,
263+
0xF7, 0xF8, 0xF9, 0x4A, 0x5A, 0x25, 0x0D,
264+
).map(_.toByte)
265+
266+
val enc = new CodePage1025
267+
val actualUnicode = decodeEbcdicString(ebcdicBytes, KeepAll, enc, improvedNullDetection = false)
268+
val actualEbcdicBytes = enc.convert(expectedUnicode, expectedUnicode.length)
269+
270+
assert(actualUnicode == expectedUnicode)
271+
assert(actualEbcdicBytes.sameElements(ebcdicBytes))
272+
}
273+
254274
"decode a CP1140 string special characters" in {
255-
val expected = "âäàáãåçñ¢.<(+|&éêëèíîïìß!$*);¬-/ÂÄÀÁÃÅÇѦ,%_>?øÉÊËÈÍÎÏÌ`:#@'=\"Øabcdefghi«»ðýþ±°jklmnopqrªºæ¸Æ€µ~stuvwxyz¡¿ÐÝÞ®^£¥·©§¶¼½¾[]¯¨´×{ABCDEFGHI\u00ADôöòóõ}JKLMNOPQR¹ûüùúÿ\\÷STUVWXYZ²ÔÖÒÓÕ0123456789³ÛÜÙÚ"
256-
val bytes = Array(
275+
val expectedUnicode = "âäàáãåçñ¢.<(+|&éêëèíîïìß!$*);¬-/ÂÄÀÁÃÅÇѦ,%_>?øÉÊËÈÍÎÏÌ`:#@'=\"Øabcdefghi«»ðýþ±°jklmnopqrªºæ¸Æ€µ~stuvwxyz¡¿ÐÝÞ®^£¥·©§¶¼½¾[]¯¨´×{ABCDEFGHI\u00ADôöòóõ}JKLMNOPQR¹ûüùúÿ\\÷STUVWXYZ²ÔÖÒÓÕ0123456789³ÛÜÙÚ"
276+
val ebcdicBytes = Array(
257277
0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
258278
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
259279
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
@@ -268,9 +288,12 @@ class StringDecodersSpec extends AnyWordSpec {
268288
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE
269289
).map(_.toByte)
270290

271-
val actual = decodeEbcdicString(bytes, KeepAll, new CodePage1140, improvedNullDetection = false)
291+
val enc = new CodePage1140
292+
val actualUnicode = decodeEbcdicString(ebcdicBytes, KeepAll, enc, improvedNullDetection = false)
293+
val actualEbcdicBytes = enc.convert(expectedUnicode, expectedUnicode.length)
272294

273-
assert(actual == expected)
295+
assert(actualUnicode == expectedUnicode)
296+
assert(actualEbcdicBytes.sameElements(ebcdicBytes))
274297
}
275298

276299
"decode a CP1141 string special characters" in {

0 commit comments

Comments
 (0)