diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/EncoderSelector.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/EncoderSelector.scala index ddce9a04..188b13a3 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/EncoderSelector.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/EncoderSelector.scala @@ -65,9 +65,9 @@ object EncoderSelector { fieldLength: Int ): Option[Encoder] = { encoding match { - case EBCDIC => + case EBCDIC if ebcdicCodePage.supportsEncoding => val encoder = (a: Any) => { - encodeEbcdicString(a.toString, CodePageCommon.asciiToEbcdicMapping, fieldLength) + ebcdicCodePage.convert(a.toString, fieldLength) } Option(encoder) case ASCII => @@ -77,31 +77,6 @@ object EncoderSelector { } } - /** - * An encoder from a ASCII basic string to an EBCDIC byte array - * - * @param string An input string - * @param conversionTable A conversion table to use to convert from ASCII to EBCDIC - * @param length The length of the output (in bytes) - * @return A string representation of the binary data - */ - def encodeEbcdicString(string: String, conversionTable: Array[Byte], length: Int): Array[Byte] = { - require(length >= 0, s"Field length cannot be negative, got $length") - - var i = 0 - val buf = new Array[Byte](length) - - // PIC X fields are space-filled on mainframe. Use EBCDIC space 0x40. - util.Arrays.fill(buf, 0x40.toByte) - - while (i < string.length && i < length) { - val asciiByte = string(i).toByte - buf(i) = conversionTable((asciiByte + 256) % 256) - i = i + 1 - } - buf - } - def getBinaryEncoder(compression: Option[Usage], precision: Int, scale: Int, diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala index 0e4a6589..5409c99d 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala @@ -18,6 +18,8 @@ package za.co.absa.cobrix.cobol.parser.encoding.codepage import za.co.absa.cobrix.cobol.internal.Logging +import java.util + /** * A trait for generalizing EBCDIC to ASCII conversion tables for different EBCDIC code pages. */ @@ -31,6 +33,23 @@ abstract class CodePage extends Serializable { * Converts an array of bytes to string according to the rules of the code page. */ def convert(bytes: Array[Byte]): String + + /** + * An encoder from a ASCII basic string to an EBCDIC byte array + * Users of this method should check whether the CodePage supports the encoding before calling it + * + * @param string An input string + * @param length The length of the output (in bytes) + * @return A string representation of the binary data + */ + def convert(string: String, length: Int): Array[Byte] + + /** + * Returns whether the CodePage is able to encode strings to ebcdic + * + * @return + */ + def supportsEncoding: Boolean } object CodePage extends Logging { diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1144.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1144.scala index 31de8e16..01e2ee83 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1144.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1144.scala @@ -22,7 +22,7 @@ package za.co.absa.cobrix.cobol.parser.encoding.codepage * It corresponds to code page 280 and only differs from it in position 9F, where the euro sign € is located instead * of the international currency symbol ¤. */ -class CodePage1144 extends SingleByteCodePage(CodePage1144.ebcdicToAsciiMapping) { +class CodePage1144 extends SingleByteCodePage(CodePage1144.ebcdicToAsciiMapping, Some(CodePage1144.asciiToEbcdicMapping)) { override def codePageShortName: String = "cp1144" } @@ -53,5 +53,268 @@ object CodePage1144 { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '³', 'Û', 'Ü', 'Ù', 'Ú', spc) // 240 - 255 } ebcdic2ascii + + } + + /** + * To generate conversion mapping use the python script shared in the PR. + */ + val asciiToEbcdicMapping: Int => Byte = { + case 0 => 0x00.toByte + case 1 => 0x01.toByte + case 2 => 0x02.toByte + case 3 => 0x03.toByte + case 156 => 0x04.toByte + case 9 => 0x05.toByte + case 134 => 0x06.toByte + case 127 => 0x07.toByte + case 151 => 0x08.toByte + case 141 => 0x09.toByte + case 142 => 0x0a.toByte + case 11 => 0x0b.toByte + case 12 => 0x0c.toByte + case 13 => 0x0d.toByte + case 14 => 0x0e.toByte + case 15 => 0x0f.toByte + case 16 => 0x10.toByte + case 17 => 0x11.toByte + case 18 => 0x12.toByte + case 19 => 0x13.toByte + case 157 => 0x14.toByte + case 8 => 0x16.toByte + case 135 => 0x17.toByte + case 24 => 0x18.toByte + case 25 => 0x19.toByte + case 146 => 0x1a.toByte + case 143 => 0x1b.toByte + case 28 => 0x1c.toByte + case 29 => 0x1d.toByte + case 30 => 0x1e.toByte + case 31 => 0x1f.toByte + case 128 => 0x20.toByte + case 129 => 0x21.toByte + case 130 => 0x22.toByte + case 131 => 0x23.toByte + case 132 => 0x24.toByte + case 10 => 0x25.toByte //NL and LF EBCDIC representation map to LF in Unicode. Choosing to always map LF to LF + case 23 => 0x26.toByte + case 27 => 0x27.toByte + case 136 => 0x28.toByte + case 137 => 0x29.toByte + case 138 => 0x2a.toByte + case 139 => 0x2b.toByte + case 140 => 0x2c.toByte + case 5 => 0x2d.toByte + case 6 => 0x2e.toByte + case 7 => 0x2f.toByte + case 144 => 0x30.toByte + case 145 => 0x31.toByte + case 22 => 0x32.toByte + case 147 => 0x33.toByte + case 148 => 0x34.toByte + case 149 => 0x35.toByte + case 150 => 0x36.toByte + case 4 => 0x37.toByte + case 152 => 0x38.toByte + case 153 => 0x39.toByte + case 154 => 0x3a.toByte + case 155 => 0x3b.toByte + case 20 => 0x3c.toByte + case 21 => 0x3d.toByte + case 158 => 0x3e.toByte + case 26 => 0x3f.toByte + case 32 => 0x40.toByte + case 160 => 0x41.toByte + case 226 => 0x42.toByte + case 228 => 0x43.toByte + case 123 => 0x44.toByte + case 225 => 0x45.toByte + case 227 => 0x46.toByte + case 229 => 0x47.toByte + case 92 => 0x48.toByte + case 241 => 0x49.toByte + case 176 => 0x4a.toByte + case 46 => 0x4b.toByte + case 60 => 0x4c.toByte + case 40 => 0x4d.toByte + case 43 => 0x4e.toByte + case 33 => 0x4f.toByte + case 38 => 0x50.toByte + case 93 => 0x51.toByte + case 234 => 0x52.toByte + case 235 => 0x53.toByte + case 125 => 0x54.toByte + case 237 => 0x55.toByte + case 238 => 0x56.toByte + case 239 => 0x57.toByte + case 126 => 0x58.toByte + case 223 => 0x59.toByte + case 233 => 0x5a.toByte + case 36 => 0x5b.toByte + case 42 => 0x5c.toByte + case 41 => 0x5d.toByte + case 59 => 0x5e.toByte + case 94 => 0x5f.toByte + case 45 => 0x60.toByte + case 47 => 0x61.toByte + case 194 => 0x62.toByte + case 196 => 0x63.toByte + case 192 => 0x64.toByte + case 193 => 0x65.toByte + case 195 => 0x66.toByte + case 197 => 0x67.toByte + case 199 => 0x68.toByte + case 209 => 0x69.toByte + case 242 => 0x6a.toByte + case 44 => 0x6b.toByte + case 37 => 0x6c.toByte + case 95 => 0x6d.toByte + case 62 => 0x6e.toByte + case 63 => 0x6f.toByte + case 248 => 0x70.toByte + case 201 => 0x71.toByte + case 202 => 0x72.toByte + case 203 => 0x73.toByte + case 200 => 0x74.toByte + case 205 => 0x75.toByte + case 206 => 0x76.toByte + case 207 => 0x77.toByte + case 204 => 0x78.toByte + case 249 => 0x79.toByte + case 58 => 0x7a.toByte + case 163 => 0x7b.toByte + case 167 => 0x7c.toByte + case 39 => 0x7d.toByte + case 61 => 0x7e.toByte + case 34 => 0x7f.toByte + case 216 => 0x80.toByte + case 97 => 0x81.toByte + case 98 => 0x82.toByte + case 99 => 0x83.toByte + case 100 => 0x84.toByte + case 101 => 0x85.toByte + case 102 => 0x86.toByte + case 103 => 0x87.toByte + case 104 => 0x88.toByte + case 105 => 0x89.toByte + case 171 => 0x8a.toByte + case 187 => 0x8b.toByte + case 240 => 0x8c.toByte + case 253 => 0x8d.toByte + case 254 => 0x8e.toByte + case 177 => 0x8f.toByte + case 91 => 0x90.toByte + case 106 => 0x91.toByte + case 107 => 0x92.toByte + case 108 => 0x93.toByte + case 109 => 0x94.toByte + case 110 => 0x95.toByte + case 111 => 0x96.toByte + case 112 => 0x97.toByte + case 113 => 0x98.toByte + case 114 => 0x99.toByte + case 170 => 0x9a.toByte + case 186 => 0x9b.toByte + case 230 => 0x9c.toByte + case 184 => 0x9d.toByte + case 198 => 0x9e.toByte + case 8364 => 0x9f.toByte + case 181 => 0xa0.toByte + case 236 => 0xa1.toByte + case 115 => 0xa2.toByte + case 116 => 0xa3.toByte + case 117 => 0xa4.toByte + case 118 => 0xa5.toByte + case 119 => 0xa6.toByte + case 120 => 0xa7.toByte + case 121 => 0xa8.toByte + case 122 => 0xa9.toByte + case 161 => 0xaa.toByte + case 191 => 0xab.toByte + case 208 => 0xac.toByte + case 221 => 0xad.toByte + case 222 => 0xae.toByte + case 174 => 0xaf.toByte + case 162 => 0xb0.toByte + case 35 => 0xb1.toByte + case 165 => 0xb2.toByte + case 183 => 0xb3.toByte + case 169 => 0xb4.toByte + case 64 => 0xb5.toByte + case 182 => 0xb6.toByte + case 188 => 0xb7.toByte + case 189 => 0xb8.toByte + case 190 => 0xb9.toByte + case 172 => 0xba.toByte + case 124 => 0xbb.toByte + case 175 => 0xbc.toByte + case 168 => 0xbd.toByte + case 180 => 0xbe.toByte + case 215 => 0xbf.toByte + case 224 => 0xc0.toByte + case 65 => 0xc1.toByte + case 66 => 0xc2.toByte + case 67 => 0xc3.toByte + case 68 => 0xc4.toByte + case 69 => 0xc5.toByte + case 70 => 0xc6.toByte + case 71 => 0xc7.toByte + case 72 => 0xc8.toByte + case 73 => 0xc9.toByte + case 173 => 0xca.toByte + case 244 => 0xcb.toByte + case 246 => 0xcc.toByte + case 166 => 0xcd.toByte + case 243 => 0xce.toByte + case 245 => 0xcf.toByte + case 232 => 0xd0.toByte + case 74 => 0xd1.toByte + case 75 => 0xd2.toByte + case 76 => 0xd3.toByte + case 77 => 0xd4.toByte + case 78 => 0xd5.toByte + case 79 => 0xd6.toByte + case 80 => 0xd7.toByte + case 81 => 0xd8.toByte + case 82 => 0xd9.toByte + case 185 => 0xda.toByte + case 251 => 0xdb.toByte + case 252 => 0xdc.toByte + case 96 => 0xdd.toByte + case 250 => 0xde.toByte + case 255 => 0xdf.toByte + case 231 => 0xe0.toByte + case 247 => 0xe1.toByte + case 83 => 0xe2.toByte + case 84 => 0xe3.toByte + case 85 => 0xe4.toByte + case 86 => 0xe5.toByte + case 87 => 0xe6.toByte + case 88 => 0xe7.toByte + case 89 => 0xe8.toByte + case 90 => 0xe9.toByte + case 178 => 0xea.toByte + case 212 => 0xeb.toByte + case 214 => 0xec.toByte + case 210 => 0xed.toByte + case 211 => 0xee.toByte + case 213 => 0xef.toByte + case 48 => 0xf0.toByte + case 49 => 0xf1.toByte + case 50 => 0xf2.toByte + case 51 => 0xf3.toByte + case 52 => 0xf4.toByte + case 53 => 0xf5.toByte + case 54 => 0xf6.toByte + case 55 => 0xf7.toByte + case 56 => 0xf8.toByte + case 57 => 0xf9.toByte + case 179 => 0xfa.toByte + case 219 => 0xfb.toByte + case 220 => 0xfc.toByte + case 217 => 0xfd.toByte + case 218 => 0xfe.toByte + case 159 => 0xff.toByte + case _ => 0x40.toByte // defaults to space if mapping not available. } } diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageCommon.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageCommon.scala index 54d6f38a..a8854bd8 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageCommon.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageCommon.scala @@ -21,7 +21,7 @@ package za.co.absa.cobrix.cobol.parser.encoding.codepage * * It is an "invariant" subset of EBCDIC. Each converted symbol should be present in all EBCDIC pages. */ -class CodePageCommon extends SingleByteCodePage(CodePageCommon.ebcdicToAsciiMapping) { +class CodePageCommon extends SingleByteCodePage(CodePageCommon.ebcdicToAsciiMapping, Some(CodePageCommon.asciiToEbcdicMapping)) { override def codePageShortName: String = "common" } @@ -56,27 +56,28 @@ object CodePageCommon { ebcdic2ascii } + private val data = Array[Byte]( + 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x0D.toByte, 0x00.toByte, 0x00.toByte, 0x25.toByte, 0x00.toByte, 0x00.toByte, // 0 - 15 + 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 16 - 31 + 0x40.toByte, 0x5A.toByte, 0x7F.toByte, 0x7B.toByte, 0x5B.toByte, 0x6C.toByte, 0x50.toByte, 0x7D.toByte, 0x4D.toByte, 0x5D.toByte, 0x5C.toByte, 0x4E.toByte, 0x6B.toByte, 0x60.toByte, 0x4B.toByte, 0x61.toByte, // 32 - 47 + 0xF0.toByte, 0xF1.toByte, 0xF2.toByte, 0xF3.toByte, 0xF4.toByte, 0xF5.toByte, 0xF6.toByte, 0xF7.toByte, 0xF8.toByte, 0xF9.toByte, 0x7A.toByte, 0x5E.toByte, 0x4C.toByte, 0x7E.toByte, 0x6E.toByte, 0x6F.toByte, // 48 - 63 + 0x7C.toByte, 0xC1.toByte, 0xC2.toByte, 0xC3.toByte, 0xC4.toByte, 0xC5.toByte, 0xC6.toByte, 0xC7.toByte, 0xC8.toByte, 0xC9.toByte, 0xD1.toByte, 0xD2.toByte, 0xD3.toByte, 0xD4.toByte, 0xD5.toByte, 0xD6.toByte, // 64 - 79 + 0xD7.toByte, 0xD8.toByte, 0xD9.toByte, 0xE2.toByte, 0xE3.toByte, 0xE4.toByte, 0xE5.toByte, 0xE6.toByte, 0xE7.toByte, 0xE8.toByte, 0xE9.toByte, 0xBA.toByte, 0xE0.toByte, 0xBB.toByte, 0xB0.toByte, 0x6D.toByte, // 80 - 95 + 0x79.toByte, 0x81.toByte, 0x82.toByte, 0x83.toByte, 0x84.toByte, 0x85.toByte, 0x86.toByte, 0x87.toByte, 0x88.toByte, 0x89.toByte, 0x91.toByte, 0x92.toByte, 0x93.toByte, 0x94.toByte, 0x95.toByte, 0x96.toByte, // 96 - 111 + 0x97.toByte, 0x98.toByte, 0x99.toByte, 0xA2.toByte, 0xA3.toByte, 0xA4.toByte, 0xA5.toByte, 0xA6.toByte, 0xA7.toByte, 0xA8.toByte, 0xA9.toByte, 0xC0.toByte, 0x6A.toByte, 0xD0.toByte, 0xA1.toByte, 0x00.toByte, // 112 - 127 + 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 128 - 143 + 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 144 - 159 + 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 160 - 175 + 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 176 - 191 + 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 192 - 207 + 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 208 - 223 + 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 224 - 239 + 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte // 240 - 255 + ) /** * This is the table for converting basic ASCII symbols to EBCDIC common code page */ - def asciiToEbcdicMapping: Array[Byte] = { - Array[Byte]( - 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x0D.toByte, 0x00.toByte, 0x00.toByte, 0x25.toByte, 0x00.toByte, 0x00.toByte, // 0 - 15 - 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 16 - 31 - 0x40.toByte, 0x5A.toByte, 0x7F.toByte, 0x7B.toByte, 0x5B.toByte, 0x6C.toByte, 0x50.toByte, 0x7D.toByte, 0x4D.toByte, 0x5D.toByte, 0x5C.toByte, 0x4E.toByte, 0x6B.toByte, 0x60.toByte, 0x4B.toByte, 0x61.toByte, // 32 - 47 - 0xF0.toByte, 0xF1.toByte, 0xF2.toByte, 0xF3.toByte, 0xF4.toByte, 0xF5.toByte, 0xF6.toByte, 0xF7.toByte, 0xF8.toByte, 0xF9.toByte, 0x7A.toByte, 0x5E.toByte, 0x4C.toByte, 0x7E.toByte, 0x6E.toByte, 0x6F.toByte, // 48 - 63 - 0x7C.toByte, 0xC1.toByte, 0xC2.toByte, 0xC3.toByte, 0xC4.toByte, 0xC5.toByte, 0xC6.toByte, 0xC7.toByte, 0xC8.toByte, 0xC9.toByte, 0xD1.toByte, 0xD2.toByte, 0xD3.toByte, 0xD4.toByte, 0xD5.toByte, 0xD6.toByte, // 64 - 79 - 0xD7.toByte, 0xD8.toByte, 0xD9.toByte, 0xE2.toByte, 0xE3.toByte, 0xE4.toByte, 0xE5.toByte, 0xE6.toByte, 0xE7.toByte, 0xE8.toByte, 0xE9.toByte, 0xBA.toByte, 0xE0.toByte, 0xBB.toByte, 0xB0.toByte, 0x6D.toByte, // 80 - 95 - 0x79.toByte, 0x81.toByte, 0x82.toByte, 0x83.toByte, 0x84.toByte, 0x85.toByte, 0x86.toByte, 0x87.toByte, 0x88.toByte, 0x89.toByte, 0x91.toByte, 0x92.toByte, 0x93.toByte, 0x94.toByte, 0x95.toByte, 0x96.toByte, // 96 - 111 - 0x97.toByte, 0x98.toByte, 0x99.toByte, 0xA2.toByte, 0xA3.toByte, 0xA4.toByte, 0xA5.toByte, 0xA6.toByte, 0xA7.toByte, 0xA8.toByte, 0xA9.toByte, 0xC0.toByte, 0x6A.toByte, 0xD0.toByte, 0xA1.toByte, 0x00.toByte, // 112 - 127 - 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 128 - 143 - 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 144 - 159 - 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 160 - 175 - 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 176 - 191 - 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 192 - 207 - 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 208 - 223 - 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 224 - 239 - 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte // 240 - 255 - ) + def asciiToEbcdicMapping: Int => Byte = (y : Int) => { + data.applyOrElse(y, (x : Int) => 0x40.toByte) } } diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/SingleByteCodePage.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/SingleByteCodePage.scala index 4afcf266..d25aa0ef 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/SingleByteCodePage.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/SingleByteCodePage.scala @@ -16,10 +16,13 @@ package za.co.absa.cobrix.cobol.parser.encoding.codepage +import java.util + /** * The base class for all single-byte EBCDIC decoders. */ -abstract class SingleByteCodePage(ebcdicToAsciiMapping: Array[Char]) extends CodePage { +abstract class SingleByteCodePage(ebcdicToAsciiMapping: Array[Char], asciiToEbcdicMapping: Option[Int => Byte]=None) + extends CodePage { private val ConversionTableElements = 256 private val conversionTable = ebcdicToAsciiMapping @@ -40,4 +43,32 @@ abstract class SingleByteCodePage(ebcdicToAsciiMapping: Array[Char]) extends Cod } buf.toString } + + /** + * An encoder from a ASCII basic string to an EBCDIC byte array + * + * @param string An input string + * @param length The length of the output (in bytes) + * @return A string representation of the binary data + */ + def convert(string: String, length: Int): Array[Byte] = { + require(length >= 0, s"Field length cannot be negative, got $length") + require(asciiToEbcdicMapping.isDefined, s"Cannot encode strings for Code Page without ASCII to EBCDIC " + + s"mapping ${this.getClass.getSimpleName}") + + var i = 0 + val buf = new Array[Byte](length) + + // PIC X fields are space-filled on mainframe. Use EBCDIC space 0x40. + util.Arrays.fill(buf, 0x40.toByte) + + while (i < string.length && i < length) { + val unicodeCodePoint: Int = string.codePointAt(i) + buf(i) = asciiToEbcdicMapping.get(unicodeCodePoint) + i = i + 1 + } + buf + } + + override def supportsEncoding: Boolean = asciiToEbcdicMapping.isDefined } diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/TwoByteCodePage.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/TwoByteCodePage.scala index faa0ea68..5d9625d9 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/TwoByteCodePage.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/TwoByteCodePage.scala @@ -90,6 +90,12 @@ abstract class TwoByteCodePage(ebcdicToAsciiMapping: Array[Char]) extends CodePa } buf.toString } + + override def convert(string: String, length: Int): Array[Byte] = throw new IllegalStateException(s"Cannot encode " + + s"strings for Code Page without ASCII to EBCDIC mapping ${this.getClass.getSimpleName}, did you forget to call" + + s"supportsEncoding?") + + override def supportsEncoding: Boolean = false } object TwoByteCodePage { diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/writer/FixedLengthEbcdicWriterSuite.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/writer/FixedLengthEbcdicWriterSuite.scala index 6e618425..e8698b66 100644 --- a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/writer/FixedLengthEbcdicWriterSuite.scala +++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/writer/FixedLengthEbcdicWriterSuite.scala @@ -78,6 +78,52 @@ class FixedLengthEbcdicWriterSuite extends AnyWordSpec with SparkTestBase with B } } + + "write simple fixed-record-length EBCDIC data files using code page 1144" in { + withTempDirectory("cobol_writer1") { tempDir => + val df = List(("A", "F|rst"), ("B", "S€nd"), ("C", "Last]")).toDF("A", "B") + + val path = new Path(tempDir, "writer1") + + df.coalesce(1) + .orderBy("A") + .write + .format("cobol") + .mode(SaveMode.Overwrite) + .option("copybook_contents", copybookContents) + .option("ebcdic_code_page", "cp1144") + .save(path.toString) + + val fs = path.getFileSystem(spark.sparkContext.hadoopConfiguration) + + assert(fs.exists(path), "Output directory should exist") + val files = fs.listStatus(path) + .filter(_.getPath.getName.startsWith("part-")) + assert(files.nonEmpty, "Output directory should contain part files") + + val partFile = files.head.getPath + val data = fs.open(partFile) + val bytes = new Array[Byte](files.head.getLen.toInt) + data.readFully(bytes) + data.close() + + // Expected EBCDIC data for sample test data + val expected = Array[Byte]( + 0xC1.toByte, 0xC6.toByte, 0xBB.toByte, 0x99.toByte, 0xa2.toByte, 0xa3.toByte, // A,F|rst + 0xC2.toByte, 0xE2.toByte, 0x9F.toByte, 0x95.toByte, 0x84.toByte, 0x40.toByte, // B,S€nd_ + 0xC3.toByte, 0xD3.toByte, 0x81.toByte, 0xa2.toByte, 0xa3.toByte, 0x51.toByte // C,Last] + ) + + if (!bytes.sameElements(expected)) { + println(s"Expected bytes: ${expected.map("%02X" format _).mkString(" ")}") + println(s"Actual bytes: ${bytes.map("%02X" format _).mkString(" ")}") + + assert(bytes.sameElements(expected), "Written data should match expected EBCDIC1144 encoding") + } + } + } + + "write data frames with different field order and null values" in { withTempDirectory("cobol_writer1") { tempDir => val df = List((1, "First", "A"), (2, "Scnd", "B"), (3, null, "C")).toDF("C", "B", "A")