|
30 | 30 | #include "mbfilter.h"
|
31 | 31 | #include "mbfilter_utf16.h"
|
32 | 32 |
|
| 33 | +static int mbfl_filt_ident_utf16(int c, mbfl_identify_filter *filter); |
| 34 | +static int mbfl_filt_ident_utf16le(int c, mbfl_identify_filter *filter); |
| 35 | +static int mbfl_filt_ident_utf16be(int c, mbfl_identify_filter *filter); |
| 36 | + |
/* NULL-terminated list of alternative names for the UTF-16 encoding. */
static const char *mbfl_encoding_utf16_aliases[] = {
	"utf16",
	NULL
};
|
34 | 38 |
|
35 | 39 | const mbfl_encoding mbfl_encoding_utf16 = {
|
@@ -65,6 +69,24 @@ const mbfl_encoding mbfl_encoding_utf16le = {
|
65 | 69 | &vtbl_wchar_utf16le
|
66 | 70 | };
|
67 | 71 |
|
| 72 | +const struct mbfl_identify_vtbl vtbl_identify_utf16 = { |
| 73 | + mbfl_no_encoding_utf16, |
| 74 | + mbfl_filt_ident_common_ctor, |
| 75 | + mbfl_filt_ident_utf16 |
| 76 | +}; |
| 77 | + |
| 78 | +const struct mbfl_identify_vtbl vtbl_identify_utf16le = { |
| 79 | + mbfl_no_encoding_utf16le, |
| 80 | + mbfl_filt_ident_common_ctor, |
| 81 | + mbfl_filt_ident_utf16le |
| 82 | +}; |
| 83 | + |
| 84 | +const struct mbfl_identify_vtbl vtbl_identify_utf16be = { |
| 85 | + mbfl_no_encoding_utf16be, |
| 86 | + mbfl_filt_ident_common_ctor, |
| 87 | + mbfl_filt_ident_utf16be |
| 88 | +}; |
| 89 | + |
68 | 90 | const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
|
69 | 91 | mbfl_no_encoding_utf16,
|
70 | 92 | mbfl_no_encoding_wchar,
|
@@ -320,3 +342,108 @@ int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
|
320 | 342 |
|
321 | 343 | return c;
|
322 | 344 | }
|
| 345 | + |
| 346 | +static int mbfl_filt_ident_utf16(int c, mbfl_identify_filter *filter) |
| 347 | +{ |
| 348 | + if (filter->status == 0) { |
| 349 | + if (c >= 0xfe) { /* could be a byte-order mark */ |
| 350 | + filter->status = c; |
| 351 | + } else { |
| 352 | + /* no byte-order mark at beginning of input; assume UTF-16BE */ |
| 353 | + filter->filter_function = mbfl_filt_ident_utf16be; |
| 354 | + return (filter->filter_function)(c, filter); |
| 355 | + } |
| 356 | + } else { |
| 357 | + unsigned short n = (filter->status << 8) | c; |
| 358 | + filter->status = 0; |
| 359 | + |
| 360 | + if (n == 0xfeff) { |
| 361 | + /* it was a big-endian byte-order mark */ |
| 362 | + filter->filter_function = mbfl_filt_ident_utf16be; |
| 363 | + } else if (n == 0xfffe) { |
| 364 | + /* it was a little-endian byte-order mark */ |
| 365 | + filter->filter_function = mbfl_filt_ident_utf16le; |
| 366 | + } else { |
| 367 | + /* it wasn't a byte-order mark */ |
| 368 | + filter->filter_function = mbfl_filt_ident_utf16be; |
| 369 | + (filter->filter_function)(n >> 8, filter); |
| 370 | + return (filter->filter_function)(c, filter); |
| 371 | + } |
| 372 | + } |
| 373 | + return c; |
| 374 | +} |
| 375 | + |
| 376 | +static int mbfl_filt_ident_utf16le(int c, mbfl_identify_filter *filter) |
| 377 | +{ |
| 378 | + switch (filter->status) { |
| 379 | + case 0: /* 1st byte */ |
| 380 | + filter->status = 1; |
| 381 | + break; |
| 382 | + |
| 383 | + case 1: /* 2nd byte */ |
| 384 | + if ((c & 0xfc) == 0xd8) { |
| 385 | + /* Looks like a surrogate pair */ |
| 386 | + filter->status = 2; |
| 387 | + } else if ((c & 0xfc) == 0xdc) { |
| 388 | + /* This is wrong; the second part of the surrogate pair has come first */ |
| 389 | + filter->flag = 1; |
| 390 | + } else { |
| 391 | + filter->status = 0; /* Just an ordinary 2-byte character */ |
| 392 | + } |
| 393 | + break; |
| 394 | + |
| 395 | + case 2: /* 3rd byte */ |
| 396 | + filter->status = 3; |
| 397 | + break; |
| 398 | + |
| 399 | + case 3: /* 4th byte */ |
| 400 | + if ((c & 0xfc) == 0xdc) { |
| 401 | + filter->status = 0; |
| 402 | + } else { |
| 403 | + filter->flag = 1; /* Surrogate pair wrongly encoded */ |
| 404 | + } |
| 405 | + break; |
| 406 | + } |
| 407 | + |
| 408 | + return c; |
| 409 | +} |
| 410 | + |
| 411 | +static int mbfl_filt_ident_utf16be(int c, mbfl_identify_filter *filter) |
| 412 | +{ |
| 413 | + switch (filter->status) { |
| 414 | + case 0: /* 1st byte */ |
| 415 | + if ((c & 0xfc) == 0xd8) { |
| 416 | + /* Looks like a surrogate pair */ |
| 417 | + filter->status = 2; |
| 418 | + } else if ((c & 0xfc) == 0xdc) { |
| 419 | + /* This is wrong; the second part of the surrogate pair has come first */ |
| 420 | + filter->flag = 1; |
| 421 | + } else { |
| 422 | + /* Just an ordinary 2-byte character */ |
| 423 | + filter->status = 1; |
| 424 | + } |
| 425 | + break; |
| 426 | + |
| 427 | + case 1: /* 2nd byte, not surrogate pair */ |
| 428 | + filter->status = 0; |
| 429 | + break; |
| 430 | + |
| 431 | + case 2: /* 2nd byte, surrogate pair */ |
| 432 | + filter->status = 3; |
| 433 | + break; |
| 434 | + |
| 435 | + case 3: /* 3rd byte, surrogate pair */ |
| 436 | + if ((c & 0xfc) == 0xdc) { |
| 437 | + filter->status = 4; |
| 438 | + } else { |
| 439 | + filter->flag = 1; /* Surrogate pair wrongly encoded */ |
| 440 | + } |
| 441 | + break; |
| 442 | + |
| 443 | + case 4: /* 4th byte, surrogate pair */ |
| 444 | + filter->status = 0; |
| 445 | + break; |
| 446 | + } |
| 447 | + |
| 448 | + return c; |
| 449 | +} |
0 commit comments