Skip to content

Commit c34b84e

Browse files
committed
Remove unused conversion code from mbstring
Over the last few years, I refactored mbstring to perform encoding conversion a buffer at a time, rather than a single byte at a time. This resulted in a huge performance increase.

After the refactoring, the old "byte-at-a-time" code was retained for two reasons:

1) It was used by the mailparse PECL extension.
2) It was used to implement mb_strcut for some text encodings.

However, after reviewing mailparse's use of mbstring, it is clear that mailparse only relies on mbstring for decoding of QPrint, and possibly Base64. It does not use the byte-at-a-time conversion code for any other encoding.

Further, mb_strcut only relies on the byte-at-a-time conversion code for a limited number of legacy text encodings, such as ISO-2022-JP, HZ, UTF-7, etc.

Hence, we can remove over 5000 lines of unused code without breaking anything. This will help to reduce binary size, and make the mbstring codebase easier to navigate for new contributors.
1 parent 11bec6b commit c34b84e

14 files changed

+608
-5451
lines changed

ext/mbstring/libmbfl/filters/mbfilter_cjk.c

Lines changed: 572 additions & 4031 deletions
Large diffs are not rendered by default.

ext/mbstring/libmbfl/filters/mbfilter_cjk.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,4 @@ int mbfilter_sjis_emoji_docomo2unicode(int s, int *snd);
4242
int mbfilter_sjis_emoji_kddi2unicode(int s, int *snd);
4343
int mbfilter_sjis_emoji_sb2unicode(int s, int *snd);
4444

45-
int mbfilter_unicode2sjis_emoji_docomo(int c, int *s1, mbfl_convert_filter *filter);
46-
int mbfilter_unicode2sjis_emoji_kddi_sjis(int c, int *s1, mbfl_convert_filter *filter);
47-
int mbfilter_unicode2sjis_emoji_sb(int c, int *s1, mbfl_convert_filter *filter);
48-
4945
#endif /* MBFL_MBFILTER_CJK_H */

ext/mbstring/libmbfl/filters/mbfilter_cp51932.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,5 @@
3333
#include "mbfilter.h"
3434

3535
extern const mbfl_encoding mbfl_encoding_cp51932;
36-
extern const struct mbfl_convert_vtbl vtbl_cp51932_wchar;
37-
extern const struct mbfl_convert_vtbl vtbl_wchar_cp51932;
38-
39-
int mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter);
40-
int mbfl_filt_conv_wchar_cp51932(int c, mbfl_convert_filter *filter);
4136

4237
#endif /* MBFL_MBFILTER_CP51932_H */

ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c

Lines changed: 2 additions & 148 deletions
Original file line numberDiff line numberDiff line change
@@ -21,70 +21,19 @@ static inline uint32_t coalesce(uint32_t a, uint32_t b)
2121
return a ? a : b;
2222
}
2323

24-
/* Helper for single-byte encodings which use a conversion table */
25-
static int mbfl_conv_singlebyte_table(int c, mbfl_convert_filter *filter, int tbl_min, const unsigned short tbl[])
26-
{
27-
if (c >= 0 && c < tbl_min) {
28-
CK((*filter->output_function)(c, filter->data));
29-
} else if (c < 0) {
30-
CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data));
31-
} else {
32-
CK((*filter->output_function)(coalesce(tbl[c - tbl_min], MBFL_BAD_INPUT), filter->data));
33-
}
34-
return 0;
35-
}
36-
37-
static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int tbl_min, const unsigned short tbl[])
38-
{
39-
if (c >= 0 && c < tbl_min) {
40-
CK((*filter->output_function)(c, filter->data));
41-
} else if (c < 0 || c == MBFL_BAD_INPUT) {
42-
CK(mbfl_filt_conv_illegal_output(c, filter));
43-
} else {
44-
for (int i = 0; i < 256 - tbl_min; i++) {
45-
if (c == tbl[i]) {
46-
CK((*filter->output_function)(i + tbl_min, filter->data));
47-
return 0;
48-
}
49-
}
50-
CK(mbfl_filt_conv_illegal_output(c, filter));
51-
}
52-
return 0;
53-
}
54-
5524
/* Initialize data structures for a single-byte encoding */
5625
#define DEF_SB(id, name, mime_name, aliases) \
57-
static int mbfl_filt_conv_##id##_wchar(int c, mbfl_convert_filter *filter); \
58-
static int mbfl_filt_conv_wchar_##id(int c, mbfl_convert_filter *filter); \
5926
static size_t mb_##id##_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); \
6027
static void mb_wchar_to_##id(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); \
61-
static const struct mbfl_convert_vtbl vtbl_##id##_wchar = { \
62-
mbfl_no_encoding_##id, \
63-
mbfl_no_encoding_wchar, \
64-
mbfl_filt_conv_common_ctor, \
65-
NULL, \
66-
mbfl_filt_conv_##id##_wchar, \
67-
mbfl_filt_conv_common_flush, \
68-
NULL \
69-
}; \
70-
static const struct mbfl_convert_vtbl vtbl_wchar_##id = { \
71-
mbfl_no_encoding_wchar, \
72-
mbfl_no_encoding_##id, \
73-
mbfl_filt_conv_common_ctor, \
74-
NULL, \
75-
mbfl_filt_conv_wchar_##id, \
76-
mbfl_filt_conv_common_flush, \
77-
NULL \
78-
}; \
7928
const mbfl_encoding mbfl_encoding_##id = { \
8029
mbfl_no_encoding_##id, \
8130
name, \
8231
mime_name, \
8332
aliases, \
8433
NULL, \
8534
MBFL_ENCTYPE_SBCS, \
86-
&vtbl_##id##_wchar, \
87-
&vtbl_wchar_##id, \
35+
NULL, \
36+
NULL, \
8837
mb_##id##_to_wchar, \
8938
mb_wchar_to_##id, \
9039
NULL, \
@@ -93,12 +42,6 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int
9342

9443
/* For single-byte encodings which use a conversion table */
9544
#define DEF_SB_TBL(id, name, mime_name, aliases, tbl_min, tbl) \
96-
static int mbfl_filt_conv_##id##_wchar(int c, mbfl_convert_filter *filter) { \
97-
return mbfl_conv_singlebyte_table(c, filter, tbl_min, tbl); \
98-
} \
99-
static int mbfl_filt_conv_wchar_##id(int c, mbfl_convert_filter *filter) { \
100-
return mbfl_conv_reverselookup_table(c, filter, tbl_min, tbl); \
101-
} \
10245
static size_t mb_##id##_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) \
10346
{ \
10447
unsigned char *p = *in, *e = p + *in_len; \
@@ -140,22 +83,6 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int
14083
static const char *ascii_aliases[] = {"ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991", "US-ASCII", "ISO646-US", "us", "IBM367", "IBM-367", "cp367", "csASCII", NULL};
14184
DEF_SB(ascii, "ASCII", "US-ASCII", ascii_aliases);
14285

143-
static int mbfl_filt_conv_ascii_wchar(int c, mbfl_convert_filter *filter)
144-
{
145-
CK((*filter->output_function)((c < 0x80) ? c : MBFL_BAD_INPUT, filter->data));
146-
return 0;
147-
}
148-
149-
static int mbfl_filt_conv_wchar_ascii(int c, mbfl_convert_filter *filter)
150-
{
151-
if (c >= 0 && c < 0x80 && c != MBFL_BAD_INPUT) {
152-
CK((*filter->output_function)(c, filter->data));
153-
} else {
154-
CK(mbfl_filt_conv_illegal_output(c, filter));
155-
}
156-
return 0;
157-
}
158-
15986
static size_t mb_ascii_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
16087
{
16188
unsigned char *p = *in, *e = p + *in_len;
@@ -195,21 +122,6 @@ static void mb_wchar_to_ascii(uint32_t *in, size_t len, mb_convert_buf *buf, boo
195122
static const char *iso8859_1_aliases[] = {"ISO8859-1", "latin1", NULL};
196123
DEF_SB(8859_1, "ISO-8859-1", "ISO-8859-1", iso8859_1_aliases);
197124

198-
static int mbfl_filt_conv_8859_1_wchar(int c, mbfl_convert_filter *filter)
199-
{
200-
return (*filter->output_function)(c, filter->data);
201-
}
202-
203-
static int mbfl_filt_conv_wchar_8859_1(int c, mbfl_convert_filter *filter)
204-
{
205-
if (c >= 0 && c < 0x100 && c != MBFL_BAD_INPUT) {
206-
CK((*filter->output_function)(c, filter->data));
207-
} else {
208-
CK(mbfl_filt_conv_illegal_output(c, filter));
209-
}
210-
return 0;
211-
}
212-
213125
static size_t mb_8859_1_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
214126
{
215127
unsigned char *p = *in, *e = p + *in_len;
@@ -494,38 +406,6 @@ static const unsigned short cp1252_ucs_table[] = {
494406
};
495407
DEF_SB(cp1252, "Windows-1252", "Windows-1252", cp1252_aliases);
496408

497-
static int mbfl_filt_conv_wchar_cp1252(int c, mbfl_convert_filter *filter)
498-
{
499-
if (c < 0 || c == MBFL_BAD_INPUT) {
500-
CK(mbfl_filt_conv_illegal_output(c, filter));
501-
} else if (c >= 0x100) {
502-
for (int n = 0; n < 32; n++) {
503-
if (c == cp1252_ucs_table[n]) {
504-
CK((*filter->output_function)(0x80 + n, filter->data));
505-
return 0;
506-
}
507-
}
508-
CK(mbfl_filt_conv_illegal_output(c, filter));
509-
} else if (c <= 0x7F || c >= 0xA0 || c == 0x81 || c == 0x8D || c == 0x8F || c == 0x90 || c == 0x9D) {
510-
CK((*filter->output_function)(c, filter->data));
511-
} else {
512-
CK(mbfl_filt_conv_illegal_output(c, filter));
513-
}
514-
return 0;
515-
}
516-
517-
static int mbfl_filt_conv_cp1252_wchar(int c, mbfl_convert_filter *filter)
518-
{
519-
int s;
520-
if (c >= 0x80 && c < 0xA0) {
521-
s = coalesce(cp1252_ucs_table[c - 0x80], MBFL_BAD_INPUT);
522-
} else {
523-
s = c;
524-
}
525-
CK((*filter->output_function)(s, filter->data));
526-
return 0;
527-
}
528-
529409
static size_t mb_cp1252_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
530410
{
531411
unsigned char *p = *in, *e = p + *in_len;
@@ -701,32 +581,6 @@ static const unsigned char ucs_armscii8_table[] = {
701581
};
702582
DEF_SB(armscii8, "ArmSCII-8", "ArmSCII-8", armscii8_aliases);
703583

704-
static int mbfl_filt_conv_armscii8_wchar(int c, mbfl_convert_filter *filter)
705-
{
706-
CK((*filter->output_function)((c < 0xA0) ? c : coalesce(armscii8_ucs_table[c - 0xA0], MBFL_BAD_INPUT), filter->data));
707-
return 0;
708-
}
709-
710-
static int mbfl_filt_conv_wchar_armscii8(int c, mbfl_convert_filter *filter)
711-
{
712-
if (c >= 0x28 && c <= 0x2F) {
713-
CK((*filter->output_function)(ucs_armscii8_table[c - 0x28], filter->data));
714-
} else if (c < 0 || c == MBFL_BAD_INPUT) {
715-
CK(mbfl_filt_conv_illegal_output(c, filter));
716-
} else if (c < 0xA0) {
717-
CK((*filter->output_function)(c, filter->data));
718-
} else {
719-
for (int n = 0; n < 0x60; n++) {
720-
if (c == armscii8_ucs_table[n]) {
721-
CK((*filter->output_function)(0xA0 + n, filter->data));
722-
return 0;
723-
}
724-
}
725-
CK(mbfl_filt_conv_illegal_output(c, filter));
726-
}
727-
return 0;
728-
}
729-
730584
static size_t mb_armscii8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
731585
{
732586
unsigned char *p = *in, *e = p + *in_len;

0 commit comments

Comments
 (0)