Skip to content

Commit 87eadce

Browse files
gh-101828: Fix jisx0213 codecs removing null characters (gh-139340)
1 parent ded59f7 commit 87eadce

File tree

4 files changed

+34
-6
lines changed

4 files changed

+34
-6
lines changed

Lib/test/multibytecodec_support.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,23 @@ def test_incrementalencoder_del_segfault(self):
282282
with self.assertRaises(AttributeError):
283283
del e.errors
284284

285+
def test_null_terminator(self):
286+
# see gh-101828
287+
text = "フルーツ"
288+
try:
289+
text.encode(self.encoding)
290+
except UnicodeEncodeError:
291+
text = "Python is cool"
292+
encode_w_null = (text + "\0").encode(self.encoding)
293+
encode_plus_null = text.encode(self.encoding) + "\0".encode(self.encoding)
294+
self.assertTrue(encode_w_null.endswith(b'\x00'))
295+
self.assertEqual(encode_w_null, encode_plus_null)
296+
297+
encode_w_null_2 = (text + "\0" + text + "\0").encode(self.encoding)
298+
encode_plus_null_2 = encode_plus_null + encode_plus_null
299+
self.assertEqual(encode_w_null_2.count(b'\x00'), 2)
300+
self.assertEqual(encode_w_null_2, encode_plus_null_2)
301+
285302

286303
class TestBase_Mapping(unittest.TestCase):
287304
pass_enctest = []
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix ``'shift_jisx0213'``, ``'shift_jis_2004'``, ``'euc_jisx0213'`` and
2+
``'euc_jis_2004'`` codecs dropping null characters
3+
because they were mistakenly treated as the second character of a two-character sequence.

Modules/cjkcodecs/_codecs_iso2022.c

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -802,10 +802,13 @@ jisx0213_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
802802
return coded;
803803

804804
case 2: /* second character of unicode pair */
805-
coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
806-
jisx0213_pair_encmap, JISX0213_ENCPAIRS);
807-
if (coded != DBCINV)
808-
return coded;
805+
if (data[1] != 0) { /* Don't consume null char as part of pair */
806+
coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
807+
jisx0213_pair_encmap, JISX0213_ENCPAIRS);
808+
if (coded != DBCINV) {
809+
return coded;
810+
}
811+
}
809812
_Py_FALLTHROUGH;
810813

811814
case -1: /* flush unterminated */

Modules/cjkcodecs/_codecs_jp.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,8 +192,11 @@ ENCODER(euc_jis_2004)
192192
JISX0213_ENCPAIRS);
193193
if (code == DBCINV)
194194
return 1;
195-
} else
195+
}
196+
else if (c2 != 0) {
197+
/* Don't consume null char as part of pair */
196198
insize = 2;
199+
}
197200
}
198201
}
199202
}
@@ -611,8 +614,10 @@ ENCODER(shift_jis_2004)
611614
if (code == DBCINV)
612615
return 1;
613616
}
614-
else
617+
else if (ch2 != 0) {
618+
/* Don't consume null char as part of pair */
615619
insize = 2;
620+
}
616621
}
617622
}
618623
}

0 commit comments

Comments
 (0)