Skip to content

Commit 97beecc

Browse files
committed
Add identify filter for UTF-16, UTF-16LE, UTF-16BE
There was one faulty test in the suite which only passed before because UTF-16 had no identify filter. After this was fixed, it exposed the problem with the test.
1 parent 6af54d8 commit 97beecc

File tree

5 files changed

+135
-12
lines changed

5 files changed

+135
-12
lines changed

ext/mbstring/libmbfl/filters/mbfilter_utf16.c

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@
3030
#include "mbfilter.h"
3131
#include "mbfilter_utf16.h"
3232

33+
static int mbfl_filt_ident_utf16(int c, mbfl_identify_filter *filter);
34+
static int mbfl_filt_ident_utf16le(int c, mbfl_identify_filter *filter);
35+
static int mbfl_filt_ident_utf16be(int c, mbfl_identify_filter *filter);
36+
3337
static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};
3438

3539
const mbfl_encoding mbfl_encoding_utf16 = {
@@ -65,6 +69,24 @@ const mbfl_encoding mbfl_encoding_utf16le = {
6569
&vtbl_wchar_utf16le
6670
};
6771

72+
/*
 * Identify-filter vtable for UTF-16 (no byte order in the encoding name).
 * Entries, in order: the encoding id, mbfl_filt_ident_common_ctor
 * (presumably the filter constructor -- field names are not visible here),
 * and the per-byte identify function mbfl_filt_ident_utf16.
 */
const struct mbfl_identify_vtbl vtbl_identify_utf16 = {
73+
mbfl_no_encoding_utf16,
74+
mbfl_filt_ident_common_ctor,
75+
mbfl_filt_ident_utf16
76+
};
77+
78+
/*
 * Identify-filter vtable for UTF-16LE.
 * Entries, in order: the encoding id, mbfl_filt_ident_common_ctor
 * (presumably the filter constructor -- field names are not visible here),
 * and the per-byte identify function mbfl_filt_ident_utf16le.
 */
const struct mbfl_identify_vtbl vtbl_identify_utf16le = {
79+
mbfl_no_encoding_utf16le,
80+
mbfl_filt_ident_common_ctor,
81+
mbfl_filt_ident_utf16le
82+
};
83+
84+
/*
 * Identify-filter vtable for UTF-16BE.
 * Entries, in order: the encoding id, mbfl_filt_ident_common_ctor
 * (presumably the filter constructor -- field names are not visible here),
 * and the per-byte identify function mbfl_filt_ident_utf16be.
 */
const struct mbfl_identify_vtbl vtbl_identify_utf16be = {
85+
mbfl_no_encoding_utf16be,
86+
mbfl_filt_ident_common_ctor,
87+
mbfl_filt_ident_utf16be
88+
};
89+
6890
const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
6991
mbfl_no_encoding_utf16,
7092
mbfl_no_encoding_wchar,
@@ -320,3 +342,108 @@ int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
320342

321343
return c;
322344
}
345+
346+
/*
 * Identify filter for plain "UTF-16".
 *
 * Inspects the first one or two input bytes for a byte-order mark, then
 * replaces filter->filter_function with the big-endian or little-endian
 * identify filter so that all later bytes go straight to that filter.
 *
 * filter->status == 0: no byte is being held back.
 *   - A byte >= 0xfe might start a byte-order mark, so it is stored in
 *     filter->status until the next byte arrives.
 *   - Any other byte means there is no byte-order mark; UTF-16BE is assumed
 *     and the byte is passed on to the big-endian filter.
 * filter->status != 0: it holds the first byte; together with `c` it forms a
 *   16-bit value.
 *   - 0xfeff: big-endian mark, switch to the BE filter (mark not forwarded).
 *   - 0xfffe: little-endian mark, switch to the LE filter (mark not forwarded).
 *   - otherwise: not a mark; switch to the BE filter and replay both bytes.
 *
 * Returns `c`, or the result of the delegated filter when `c` was forwarded.
 */
static int mbfl_filt_ident_utf16(int c, mbfl_identify_filter *filter)
{
	unsigned short first_unit;

	if (filter->status != 0) {
		/* Second byte: join it with the byte that was held back */
		first_unit = (filter->status << 8) | c;
		filter->status = 0;

		switch (first_unit) {
		case 0xfeff:
			/* big-endian byte-order mark */
			filter->filter_function = mbfl_filt_ident_utf16be;
			return c;

		case 0xfffe:
			/* little-endian byte-order mark */
			filter->filter_function = mbfl_filt_ident_utf16le;
			return c;

		default:
			/* not a byte-order mark; feed both bytes to the BE filter */
			filter->filter_function = mbfl_filt_ident_utf16be;
			(filter->filter_function)(first_unit >> 8, filter);
			return (filter->filter_function)(c, filter);
		}
	}

	if (c >= 0xfe) {
		/* possible first half of a byte-order mark; wait for the next byte */
		filter->status = c;
		return c;
	}

	/* no byte-order mark at beginning of input; assume UTF-16BE */
	filter->filter_function = mbfl_filt_ident_utf16be;
	return (filter->filter_function)(c, filter);
}
375+
376+
/*
 * Identify filter for UTF-16LE, fed one byte at a time.
 *
 * filter->status tracks the position inside the current character:
 *   0 - expecting the 1st byte (accepted without inspection)
 *   1 - expecting the 2nd byte; its top six bits decide what follows
 *   2 - expecting the 3rd byte (first byte of the second half of a pair)
 *   3 - expecting the 4th byte, which must be in 0xdc..0xdf
 *
 * On a malformed surrogate sequence filter->flag is set to 1 and the status
 * is left unchanged. Always returns `c`.
 */
static int mbfl_filt_ident_utf16le(int c, mbfl_identify_filter *filter)
{
	const int top_bits = c & 0xfc;

	if (filter->status == 0) {
		/* 1st byte */
		filter->status = 1;
	} else if (filter->status == 1) {
		/* 2nd byte */
		if (top_bits == 0xd8) {
			/* first half of a surrogate pair; two more bytes must follow */
			filter->status = 2;
		} else if (top_bits == 0xdc) {
			/* second half of a surrogate pair with no first half before it */
			filter->flag = 1;
		} else {
			/* ordinary 2-byte character is complete */
			filter->status = 0;
		}
	} else if (filter->status == 2) {
		/* 3rd byte */
		filter->status = 3;
	} else if (filter->status == 3) {
		/* 4th byte */
		if (top_bits == 0xdc) {
			filter->status = 0;
		} else {
			/* surrogate pair wrongly encoded */
			filter->flag = 1;
		}
	}

	return c;
}
410+
411+
/*
 * Identify filter for UTF-16BE, fed one byte at a time.
 *
 * filter->status tracks the position inside the current character:
 *   0 - expecting the 1st byte; its top six bits decide what follows
 *   1 - expecting the 2nd byte of an ordinary 2-byte character
 *   2 - expecting the 2nd byte of the first half of a surrogate pair
 *   3 - expecting the 3rd byte, which must be in 0xdc..0xdf
 *   4 - expecting the 4th byte of a surrogate pair
 *
 * On a malformed surrogate sequence filter->flag is set to 1 and the status
 * is left unchanged. Always returns `c`.
 */
static int mbfl_filt_ident_utf16be(int c, mbfl_identify_filter *filter)
{
	const int top_bits = c & 0xfc;

	if (filter->status == 0) {
		/* 1st byte */
		if (top_bits == 0xd8) {
			/* first half of a surrogate pair */
			filter->status = 2;
		} else if (top_bits == 0xdc) {
			/* second half of a surrogate pair with no first half before it */
			filter->flag = 1;
		} else {
			/* ordinary 2-byte character */
			filter->status = 1;
		}
	} else if (filter->status == 1) {
		/* 2nd byte, not surrogate pair */
		filter->status = 0;
	} else if (filter->status == 2) {
		/* 2nd byte, surrogate pair */
		filter->status = 3;
	} else if (filter->status == 3) {
		/* 3rd byte, surrogate pair */
		if (top_bits == 0xdc) {
			filter->status = 4;
		} else {
			/* surrogate pair wrongly encoded */
			filter->flag = 1;
		}
	} else if (filter->status == 4) {
		/* 4th byte, surrogate pair */
		filter->status = 0;
	}

	return c;
}

ext/mbstring/libmbfl/filters/mbfilter_utf16.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@
3333
extern const mbfl_encoding mbfl_encoding_utf16;
3434
extern const mbfl_encoding mbfl_encoding_utf16be;
3535
extern const mbfl_encoding mbfl_encoding_utf16le;
36+
extern const struct mbfl_identify_vtbl vtbl_identify_utf16;
37+
extern const struct mbfl_identify_vtbl vtbl_identify_utf16le;
38+
extern const struct mbfl_identify_vtbl vtbl_identify_utf16be;
3639
extern const struct mbfl_convert_vtbl vtbl_utf16_wchar;
3740
extern const struct mbfl_convert_vtbl vtbl_wchar_utf16;
3841
extern const struct mbfl_convert_vtbl vtbl_utf16be_wchar;

ext/mbstring/libmbfl/mbfl/mbfilter.c

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -396,18 +396,6 @@ const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd
396396
}
397397
n--;
398398
}
399-
400-
/* fallback judge */
401-
if (!encoding) {
402-
n = identd->filter_list_size - 1;
403-
while (n >= 0) {
404-
filter = identd->filter_list[n];
405-
if (!filter->flag) {
406-
encoding = filter->encoding;
407-
}
408-
n--;
409-
}
410-
}
411399
}
412400

413401
return encoding;

ext/mbstring/libmbfl/mbfl/mbfl_ident.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,9 @@ static const struct mbfl_identify_vtbl *mbfl_identify_filter_list[] = {
160160
&vtbl_identify_cp50222,
161161
&vtbl_identify_gb18030,
162162
&vtbl_identify_7bit,
163+
&vtbl_identify_utf16,
164+
&vtbl_identify_utf16le,
165+
&vtbl_identify_utf16be,
163166
&vtbl_identify_false,
164167
NULL
165168
};

ext/mbstring/tests/mb_convert_encoding_failed_detection.phpt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
--TEST--
22
mb_convert_encoding() when encoding detection fails
3+
--INI--
4+
mbstring.strict_detection=1
35
--FILE--
46
<?php
57

0 commit comments

Comments
 (0)