Skip to content

Commit b626e89

Browse files
committed
Fix conversion of ISO-2022-KR text (and add test suite)
- Truncated multi-byte characters are treated as an error
- Truncated or unrecognized escape sequences are treated as an error
- ASCII control characters are not allowed to appear in the middle of a multi-byte character
1 parent 658db1f commit b626e89

File tree

4 files changed

+8479
-90
lines changed

4 files changed

+8479
-90
lines changed

ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c

Lines changed: 85 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,18 @@
2727
*
2828
*/
2929

30+
/* ISO-2022-KR is defined in RFC 1557
31+
* The RFC says that _each_ line which uses KS X 1001 characters must start
32+
* with an escape sequence of ESC $ ) C
33+
* We don't enforce that for ISO-2022-KR input */
34+
3035
#include "mbfilter.h"
3136
#include "mbfilter_iso2022_kr.h"
3237
#include "unicode_table_uhc.h"
3338

39+
static int mbfl_filt_conv_2022kr_wchar_flush(mbfl_convert_filter *filter);
40+
static int mbfl_filt_conv_any_2022kr_flush(mbfl_convert_filter *filter);
41+
3442
const mbfl_encoding mbfl_encoding_2022kr = {
3543
mbfl_no_encoding_2022kr,
3644
"ISO-2022-KR",
@@ -58,61 +66,58 @@ const struct mbfl_convert_vtbl vtbl_2022kr_wchar = {
5866
mbfl_filt_conv_common_ctor,
5967
NULL,
6068
mbfl_filt_conv_2022kr_wchar,
61-
mbfl_filt_conv_common_flush,
69+
mbfl_filt_conv_2022kr_wchar_flush,
6270
NULL,
6371
};
6472

6573
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
6674

67-
/*
68-
* ISO-2022-KR => wchar
69-
*/
70-
int
71-
mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter)
75+
int mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter)
7276
{
73-
int c1, w, flag;
77+
int w = 0;
7478

75-
retry:
7679
switch (filter->status & 0xf) {
77-
/* case 0x00: ASCII */
78-
/* case 0x10: KSC5601 */
80+
/* case 0x00: ASCII */
81+
/* case 0x10: KSC5601 */
7982
case 0:
8083
if (c == 0x1b) { /* ESC */
8184
filter->status += 2;
82-
} else if (c == 0x0f) { /* SI (ASCII) */
83-
filter->status &= ~0xff;
84-
} else if (c == 0x0e) { /* SO (KSC5601) */
85-
filter->status |= 0x10;
86-
} else if ((filter->status & 0x10) != 0 && c > 0x20 && c < 0x7f) {
85+
} else if (c == 0x0f) { /* shift in (ASCII) */
86+
filter->status = 0;
87+
} else if (c == 0x0e) { /* shift out (KSC5601) */
88+
filter->status = 0x10;
89+
} else if ((filter->status & 0x10) && c > 0x20 && c < 0x7f) {
8790
/* KSC5601 lead byte */
8891
filter->cache = c;
89-
filter->status += 1;
90-
} else if ((filter->status & 0x10) == 0 && c >= 0 && c < 0x80) {
92+
filter->status = 0x11;
93+
} else if ((filter->status & 0x10) == 0 && c >= 0 && c < 0x80) {
9194
/* latin, CTLs */
9295
CK((*filter->output_function)(c, filter->data));
9396
} else {
94-
w = c & MBFL_WCSGROUP_MASK;
95-
w |= MBFL_WCSGROUP_THROUGH;
96-
CK((*filter->output_function)(w, filter->data));
97+
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
9798
}
9899
break;
99100

100-
case 1: /* dbcs second byte */
101-
filter->status &= ~0xf;
102-
c1 = filter->cache;
103-
flag = 0;
101+
case 1: /* dbcs second byte */
102+
filter->status = 0x10;
103+
int c1 = filter->cache;
104+
int flag = 0;
105+
104106
if (c1 > 0x20 && c1 < 0x47) {
105107
flag = 1;
106108
} else if (c1 >= 0x47 && c1 <= 0x7e && c1 != 0x49) {
107109
flag = 2;
108110
}
111+
109112
if (flag > 0 && c > 0x20 && c < 0x7f) {
110-
if (flag == 1){
111-
w = (c1 - 0x21)*190 + (c - 0x41) + 0x80;
112-
if (w >= 0 && w < uhc2_ucs_table_size) {
113-
w = uhc2_ucs_table[w];
114-
} else {
115-
w = 0;
113+
if (flag == 1) {
114+
if (c1 != 0x22 || c <= 0x65) {
115+
w = (c1 - 0x21)*190 + (c - 0x41) + 0x80;
116+
if (w >= 0 && w < uhc2_ucs_table_size) {
117+
w = uhc2_ucs_table[w];
118+
} else {
119+
w = 0;
120+
}
116121
}
117122
} else {
118123
w = (c1 - 0x47)*94 + (c - 0x21);
@@ -124,54 +129,40 @@ mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter)
124129
}
125130

126131
if (w <= 0) {
127-
w = (c1 << 8) | c;
128-
w &= MBFL_WCSPLANE_MASK;
129-
w |= MBFL_WCSPLANE_KSC5601;
132+
w = (c1 << 8) | c | MBFL_WCSPLANE_KSC5601;
130133
}
131134
CK((*filter->output_function)(w, filter->data));
132-
} else if (c == 0x1b) { /* ESC */
133-
filter->status++;
134-
} else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
135-
CK((*filter->output_function)(c, filter->data));
136135
} else {
137-
w = (c1 << 8) | c;
138-
w &= MBFL_WCSGROUP_MASK;
139-
w |= MBFL_WCSGROUP_THROUGH;
136+
w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
140137
CK((*filter->output_function)(w, filter->data));
141138
}
142139
break;
143140

144-
case 2: /* ESC */
145-
if (c == 0x24) { /* '$' */
141+
case 2: /* ESC */
142+
if (c == '$') {
146143
filter->status++;
147144
} else {
148-
filter->status &= ~0xf;
149-
CK((*filter->output_function)(0x1b, filter->data));
150-
goto retry;
145+
filter->status = 0;
146+
CK((*filter->output_function)(0x1b | MBFL_WCSGROUP_THROUGH, filter->data));
151147
}
152148
break;
153-
case 3: /* ESC $ */
154-
if (c == 0x29) { /* ')' */
149+
150+
case 3: /* ESC $ */
151+
if (c == ')') {
155152
filter->status++;
156153
} else {
157-
filter->status &= ~0xf;
158-
CK((*filter->output_function)(0x1b, filter->data));
159-
CK((*filter->output_function)(0x24, filter->data));
160-
goto retry;
154+
filter->status = 0;
155+
CK((*filter->output_function)(0x1b24 | MBFL_WCSGROUP_THROUGH, filter->data));
161156
}
162157
break;
163-
case 4: /* ESC $ ) */
164-
if (c == 0x43) { /* 'C' */
165-
filter->status &= ~0xf;
166-
filter->status |= 0x100;
167-
} else {
168-
filter->status &= ~0xf;
169-
CK((*filter->output_function)(0x1b, filter->data));
170-
CK((*filter->output_function)(0x24, filter->data));
171-
CK((*filter->output_function)(0x29, filter->data));
172-
goto retry;
158+
159+
case 4: /* ESC $ ) */
160+
filter->status = 0;
161+
if (c != 'C') {
162+
CK((*filter->output_function)(0x1b2429 | MBFL_WCSGROUP_THROUGH, filter->data));
173163
}
174164
break;
165+
175166
default:
176167
filter->status = 0;
177168
break;
@@ -180,15 +171,23 @@ mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter)
180171
return c;
181172
}
182173

183-
/*
184-
* wchar => ISO-2022-KR
185-
*/
186-
int
187-
mbfl_filt_conv_wchar_2022kr(int c, mbfl_convert_filter *filter)
174+
/*
 * Flush handler for the ISO-2022-KR => wchar filter.
 *
 * If the input ended in the middle of a 2-byte character or an escape
 * sequence (low nibble of filter->status is non-zero), the bytes consumed so
 * far are reported as one error marker tagged with MBFL_WCSGROUP_THROUGH,
 * packed the same way the per-byte conversion function packs them:
 *   state 1 (lead byte seen)  -> the cached lead byte
 *   state 2 (ESC)             -> 0x1b
 *   state 3 (ESC $)           -> 0x1b24
 *   state 4 (ESC $ ))         -> 0x1b2429
 * The filter state is cleared in every case, then the downstream flush
 * function (if any) is invoked.
 *
 * Returns 0 on success, or -1 if the output function reports an error.
 */
static int mbfl_filt_conv_2022kr_wchar_flush(mbfl_convert_filter *filter)
{
	int state = filter->status & 0xF;
	int truncated;

	switch (state) {
	case 2: /* ESC */
		truncated = 0x1b;
		break;
	case 3: /* ESC $ */
		truncated = 0x1b24;
		break;
	case 4: /* ESC $ ) */
		truncated = 0x1b2429;
		break;
	default:
		/* Only the lead-byte state stores anything in the cache; the
		 * escape-sequence states above never write to it. */
		truncated = filter->cache;
		break;
	}

	/* Reset before emitting so the filter is left clean even if the output
	 * function fails and CK() returns early. */
	filter->status = filter->cache = 0;

	if (state) {
		/* 2-byte character or escape sequence was truncated */
		CK((*filter->output_function)(truncated | MBFL_WCSGROUP_THROUGH, filter->data));
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
190187

191-
s = 0;
188+
int mbfl_filt_conv_wchar_2022kr(int c, mbfl_convert_filter *filter)
189+
{
190+
int c1, c2, s = 0;
192191

193192
if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) {
194193
s = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min];
@@ -209,43 +208,41 @@ mbfl_filt_conv_wchar_2022kr(int c, mbfl_convert_filter *filter)
209208
c1 = (s >> 8) & 0xff;
210209
c2 = s & 0xff;
211210
/* exclude UHC extension area */
212-
if (c1 < 0xa1 || c2 < 0xa1){
211+
if (c1 < 0xa1 || c2 < 0xa1) {
213212
s = c;
214213
}
214+
215215
if (s & 0x8000) {
216216
s -= 0x8080;
217217
}
218218

219219
if (s <= 0) {
220-
c1 = c & ~MBFL_WCSPLANE_MASK;
221-
if (c1 == MBFL_WCSPLANE_KSC5601) {
222-
s = c & MBFL_WCSPLANE_MASK;
223-
}
224220
if (c == 0) {
225221
s = 0;
226-
} else if (s <= 0) {
222+
} else {
227223
s = -1;
228224
}
229225
} else if ((s >= 0x80 && s < 0x2121) || (s > 0x8080)) {
230226
s = -1;
231227
}
228+
232229
if (s >= 0) {
233-
if (s < 0x80 && s > 0) { /* ASCII */
234-
if ((filter->status & 0x10) != 0) {
235-
CK((*filter->output_function)(0x0f, filter->data)); /* SI */
230+
if (s < 0x80 && s >= 0) { /* ASCII */
231+
if (filter->status & 0x10) {
232+
CK((*filter->output_function)(0x0f, filter->data)); /* shift in */
236233
filter->status &= ~0x10;
237234
}
238235
CK((*filter->output_function)(s, filter->data));
239236
} else {
240-
if ( (filter->status & 0x100) == 0) {
241-
CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
242-
CK((*filter->output_function)(0x24, filter->data)); /* '$' */
243-
CK((*filter->output_function)(0x29, filter->data)); /* ')' */
244-
CK((*filter->output_function)(0x43, filter->data)); /* 'C' */
237+
if ((filter->status & 0x100) == 0) {
238+
CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
239+
CK((*filter->output_function)('$', filter->data));
240+
CK((*filter->output_function)(')', filter->data));
241+
CK((*filter->output_function)('C', filter->data));
245242
filter->status |= 0x100;
246243
}
247244
if ((filter->status & 0x10) == 0) {
248-
CK((*filter->output_function)(0x0e, filter->data)); /* SO */
245+
CK((*filter->output_function)(0x0e, filter->data)); /* shift out */
249246
filter->status |= 0x10;
250247
}
251248
CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
@@ -258,17 +255,16 @@ mbfl_filt_conv_wchar_2022kr(int c, mbfl_convert_filter *filter)
258255
return c;
259256
}
260257

261-
int
262-
mbfl_filt_conv_any_2022kr_flush(mbfl_convert_filter *filter)
258+
static int mbfl_filt_conv_any_2022kr_flush(mbfl_convert_filter *filter)
263259
{
264260
/* back to ascii */
265-
if ((filter->status & 0xff00) != 0) {
266-
CK((*filter->output_function)(0x0f, filter->data)); /* SI */
261+
if (filter->status & 0xff00) {
262+
CK((*filter->output_function)(0x0f, filter->data)); /* shift in */
267263
}
268264

269-
filter->status &= 0xff;
265+
filter->status = filter->cache = 0;
270266

271-
if (filter->flush_function != NULL) {
267+
if (filter->flush_function) {
272268
return (*filter->flush_function)(filter->data);
273269
}
274270

ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,5 @@ extern const struct mbfl_convert_vtbl vtbl_2022kr_wchar;
3838

3939
int mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter);
4040
int mbfl_filt_conv_wchar_2022kr(int c, mbfl_convert_filter *filter);
41-
int mbfl_filt_conv_any_2022kr_flush(mbfl_convert_filter *filter);
4241

4342
#endif /* MBFL_MBFILTER_ISO2022_KR_H */

0 commit comments

Comments
 (0)