diff --git a/CMakeLists.txt b/CMakeLists.txt index 9535e0a35..055b38eb6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -652,6 +652,9 @@ else(STDLIBS_HAVE_GETRPCBYNUMBER) endif(STDLIBS_HAVE_GETRPCBYNUMBER) cmake_pop_check_state() + +check_function_exists(wcwidth HAVE_WCWIDTH) + # # This requires the libraries we require, as ether_ntohost might be # in one of those libraries. That means we have to do this after @@ -1443,7 +1446,7 @@ set(NETDISSECT_SOURCE_LIST_C # # Replace missing functions # -foreach(FUNC strlcat strlcpy strsep getservent getopt_long) +foreach(FUNC strlcat strlcpy strsep getservent getopt_long wcwidth) string(TOUPPER ${FUNC} FUNC_UPPERCASE) set(HAVE_FUNC_UPPERCASE HAVE_${FUNC_UPPERCASE}) if(NOT ${HAVE_FUNC_UPPERCASE}) diff --git a/Makefile.in b/Makefile.in index fc102099c..3b0529b04 100644 --- a/Makefile.in +++ b/Makefile.in @@ -363,6 +363,8 @@ EXTRA_DIST = \ missing/strlcat.c \ missing/strlcpy.c \ missing/strsep.c \ + missing/wcwidth.c \ + missing/wcwidth.h \ mkdep \ packetdat.awk \ print-smb.c \ @@ -396,6 +398,8 @@ strlcpy.o: $(srcdir)/missing/strlcpy.c $(CC) $(FULL_CFLAGS) -o $@ -c $(srcdir)/missing/strlcpy.c strsep.o: $(srcdir)/missing/strsep.c $(CC) $(FULL_CFLAGS) -o $@ -c $(srcdir)/missing/strsep.c +wcwidth.o: $(srcdir)/missing/wcwidth.c + $(CC) $(FULL_CFLAGS) -o $@ -c $(srcdir)/missing/wcwidth.c install: all [ -d $(DESTDIR)$(bindir) ] || \ diff --git a/cmakeconfig.h.in b/cmakeconfig.h.in index b97e54171..63828a3d0 100644 --- a/cmakeconfig.h.in +++ b/cmakeconfig.h.in @@ -126,6 +126,9 @@ /* Define to 1 if you have the `wsockinit' function. */ #cmakedefine HAVE_WSOCKINIT 1 +/* Define to 1 if you have the `wcwidth' function. 
*/ +#cmakedefine HAVE_WCWIDTH 1 + /* define if libpcap has yydebug */ #cmakedefine HAVE_YYDEBUG 1 diff --git a/configure.ac b/configure.ac index 68d80f114..32653a1c4 100644 --- a/configure.ac +++ b/configure.ac @@ -318,7 +318,7 @@ fi # AC_LBL_LIBRARY_NET -AC_REPLACE_FUNCS(strlcat strlcpy strsep getservent getopt_long) +AC_REPLACE_FUNCS(strlcat strlcpy strsep getservent getopt_long wcwidth) AC_CHECK_FUNCS(fork vfork) # diff --git a/missing/wcwidth.c b/missing/wcwidth.c new file mode 100644 index 000000000..a1d31fac0 --- /dev/null +++ b/missing/wcwidth.c @@ -0,0 +1,207 @@ +/* + * The following wcwidth implementation is based on Markus Kuhn's + * implementation and adapted. + * + * This is an implementation of wcwidth() and wcswidth() (defined in + * IEEE Std 1003.1-2001) for Unicode. + * + * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html + * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html + * + * In fixed-width output devices, Latin characters all occupy a single + * "cell" position of equal width, whereas ideographic CJK characters + * occupy two such cells. Interoperability between terminal-line + * applications and (teletype-style) character terminals using the + * UTF-8 encoding requires agreement on which character should advance + * the cursor by how many cell positions. No established formal + * standards exist at present on which Unicode character shall occupy + * how many cell positions on character terminals. These routines are + * a first attempt of defining such behavior based on simple rules + * applied to data provided by the Unicode Consortium. + * + * For some graphical characters, the Unicode standard explicitly + * defines a character-cell width via the definition of the East Asian + * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes. + * In all these cases, there is no ambiguity about which width a + * terminal shall use.
For characters in the East Asian Ambiguous (A) + * class, the width choice depends purely on a preference of backward + * compatibility with either historic CJK or Western practice. + * Choosing single-width for these characters is easy to justify as + * the appropriate long-term solution, as the CJK practice of + * displaying these characters as double-width comes from historic + * implementation simplicity (8-bit encoded characters were displayed + * single-width and 16-bit ones double-width, even for Greek, + * Cyrillic, etc.) and not any typographic considerations. + * + * Much less clear is the choice of width for the Not East Asian + * (Neutral) class. Existing practice does not dictate a width for any + * of these characters. It would nevertheless make sense + * typographically to allocate two character cells to characters such + * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be + * represented adequately with a single-width glyph. The following + * routines at present merely assign a single-cell width to all + * neutral characters, in the interest of simplicity. This is not + * entirely satisfactory and should be reconsidered before + * establishing a formal standard in this area. At the moment, the + * decision which Not East Asian (Neutral) characters should be + * represented by double-width glyphs cannot yet be answered by + * applying a simple rule from the Unicode database content. Setting + * up a proper standard for the behavior of UTF-8 character terminals + * will require a careful analysis not only of each Unicode character, + * but also of each presentation form, something the author of these + * routines has avoided to do so far. + * + * http://www.unicode.org/unicode/reports/tr11/ + * + * Markus Kuhn -- 2007-05-26 (Unicode 5.0) + * + * Permission to use, copy, modify, and distribute this software + * for any purpose and without fee is hereby granted. The author + * disclaims all warranties with regard to this software. 
+ * + * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c + */ + +#include + +struct interval { + int first; + int last; +}; + +/* auxiliary function for binary search in interval table: returns 1 if ucs lies inside one of table[0..max] (max is the index of the last entry, not the entry count), 0 otherwise; the search assumes the intervals are sorted and non-overlapping */ +static int bisearch(wchar_t ucs, const struct interval *table, int max) { + int min = 0; + int mid; + + if (ucs < table[0].first || ucs > table[max].last) + return 0; + while (max >= min) { + mid = (min + max) / 2; + if (ucs > table[mid].last) + min = mid + 1; + else if (ucs < table[mid].first) + max = mid - 1; + else + return 1; + } + + return 0; +} + + +/* The following function defines the column width of an ISO 10646 + * character as follows: + * + * - The null character (U+0000) has a column width of 0. + * + * - Other C0/C1 control characters and DEL will lead to a return + * value of -1. + * + * - Non-spacing and enclosing combining characters (general + * category code Mn or Me in the Unicode database) have a + * column width of 0. + * + * - SOFT HYPHEN (U+00AD) has a column width of 1. + * + * - Other format characters (general category code Cf in the Unicode + * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0. + * + * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) + * have a column width of 0. + * + * - Spacing characters in the East Asian Wide (W) or East Asian + * Full-width (F) category as defined in Unicode Technical + * Report #11 have a column width of 2. + * + * - All remaining characters (including all printable + * ISO 8859-1 and WGL4 characters, Unicode control characters, + * etc.) have a column width of 1. + * + * This implementation assumes that wchar_t characters are encoded + * in ISO 10646.
+ */ + +int wcwidth(wchar_t ucs) +{ + /* sorted list of non-overlapping intervals of non-spacing characters */ + /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ + static const struct interval combining[] = { + { 0x0300, 0x036F }, { 0x0483, 0x0486 }, { 0x0488, 0x0489 }, + { 0x0591, 0x05BD }, { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, + { 0x05C4, 0x05C5 }, { 0x05C7, 0x05C7 }, { 0x0600, 0x0603 }, + { 0x0610, 0x0615 }, { 0x064B, 0x065E }, { 0x0670, 0x0670 }, + { 0x06D6, 0x06E4 }, { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, + { 0x070F, 0x070F }, { 0x0711, 0x0711 }, { 0x0730, 0x074A }, + { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 }, { 0x0901, 0x0902 }, + { 0x093C, 0x093C }, { 0x0941, 0x0948 }, { 0x094D, 0x094D }, + { 0x0951, 0x0954 }, { 0x0962, 0x0963 }, { 0x0981, 0x0981 }, + { 0x09BC, 0x09BC }, { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD }, + { 0x09E2, 0x09E3 }, { 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C }, + { 0x0A41, 0x0A42 }, { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, + { 0x0A70, 0x0A71 }, { 0x0A81, 0x0A82 }, { 0x0ABC, 0x0ABC }, + { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 }, { 0x0ACD, 0x0ACD }, + { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 }, { 0x0B3C, 0x0B3C }, + { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B43 }, { 0x0B4D, 0x0B4D }, + { 0x0B56, 0x0B56 }, { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 }, + { 0x0BCD, 0x0BCD }, { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 }, + { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 }, { 0x0CBC, 0x0CBC }, + { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD }, + { 0x0CE2, 0x0CE3 }, { 0x0D41, 0x0D43 }, { 0x0D4D, 0x0D4D }, + { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 }, { 0x0DD6, 0x0DD6 }, + { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E }, + { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 }, { 0x0EBB, 0x0EBC }, + { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 }, { 0x0F35, 0x0F35 }, + { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 }, { 0x0F71, 0x0F7E }, + { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 }, { 0x0F90, 0x0F97 }, + { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 }, { 0x102D, 0x1030 }, + { 0x1032, 0x1032 }, {
0x1036, 0x1037 }, { 0x1039, 0x1039 }, + { 0x1058, 0x1059 }, { 0x1160, 0x11FF }, { 0x135F, 0x135F }, + { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 }, + { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD }, + { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD }, + { 0x180B, 0x180D }, { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 }, + { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, { 0x1939, 0x193B }, + { 0x1A17, 0x1A18 }, { 0x1B00, 0x1B03 }, { 0x1B34, 0x1B34 }, + { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C }, { 0x1B42, 0x1B42 }, + { 0x1B6B, 0x1B73 }, { 0x1DC0, 0x1DCA }, { 0x1DFE, 0x1DFF }, + { 0x200B, 0x200F }, { 0x202A, 0x202E }, { 0x2060, 0x2063 }, + { 0x206A, 0x206F }, { 0x20D0, 0x20EF }, { 0x302A, 0x302F }, + { 0x3099, 0x309A }, { 0xA806, 0xA806 }, { 0xA80B, 0xA80B }, + { 0xA825, 0xA826 }, { 0xFB1E, 0xFB1E }, { 0xFE00, 0xFE0F }, + { 0xFE20, 0xFE23 }, { 0xFEFF, 0xFEFF }, { 0xFFF9, 0xFFFB }, + { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F }, + { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x1D167, 0x1D169 }, + { 0x1D173, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD }, + { 0x1D242, 0x1D244 }, { 0xE0001, 0xE0001 }, { 0xE0020, 0xE007F }, + { 0xE0100, 0xE01EF } + }; + + /* U+0000 has width 0; the other C0/C1 control characters and DEL are reported as -1 */ + if (ucs == 0) + return 0; + if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0)) + return -1; + + /* binary search in table of non-spacing characters; the last argument is the index of the final table entry */ + if (bisearch(ucs, combining, + sizeof(combining) / sizeof(struct interval) - 1)) + return 0; + + /* if we arrive here, ucs is not a combining or C0/C1 control character */ + + return 1 + + (ucs >= 0x1100 && + (ucs <= 0x115f || /* Hangul Jamo init. consonants */ + ucs == 0x2329 || ucs == 0x232a || + (ucs >= 0x2e80 && ucs <= 0xa4cf && + ucs != 0x303f) || /* CJK ...
Yi */ + (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */ + (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility Ideographs */ + (ucs >= 0xfe10 && ucs <= 0xfe19) || /* Vertical forms */ + (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */ + (ucs >= 0xff00 && ucs <= 0xff60) || /* Fullwidth Forms */ + (ucs >= 0xffe0 && ucs <= 0xffe6) || + (ucs >= 0x20000 && ucs <= 0x2fffd) || + (ucs >= 0x30000 && ucs <= 0x3fffd))); +} diff --git a/missing/wcwidth.h b/missing/wcwidth.h new file mode 100644 index 000000000..3434934e0 --- /dev/null +++ b/missing/wcwidth.h @@ -0,0 +1,14 @@ +/* + * wcwidth() declaration for platforms whose C library lacks it (e.g. Windows) + * Based on Markus Kuhn's public domain implementation + */ + +#ifndef WCWIDTH_H +#define WCWIDTH_H + +#include + +/* Determine the column width of a wide character */ +int wcwidth(wchar_t ucs); + +#endif /* WCWIDTH_H */ diff --git a/netdissect.h b/netdissect.h index 6663e9d6d..da229cfb6 100644 --- a/netdissect.h +++ b/netdissect.h @@ -231,6 +231,7 @@ struct netdissect_options { int ndo_Aflag; /* print packet only in ASCII observing TAB, * LF, CR and SPACE as graphical chars */ + int ndo_utf8; /* interpret ASCII output as UTF-8 */ int ndo_Hflag; /* dissect 802.11s draft mesh standard */ const char *ndo_protocol; /* protocol */ jmp_buf ndo_early_end; /* jmp_buf for setjmp()/longjmp() */ diff --git a/print-ascii.c b/print-ascii.c index f504d2f7f..ce1b560d9 100644 --- a/print-ascii.c +++ b/print-ascii.c @@ -44,6 +44,13 @@ #include +#include +#include + +#ifndef HAVE_WCWIDTH +#include "missing/wcwidth.h" +#endif + #include "netdissect-ctype.h" #include "netdissect.h" @@ -56,6 +63,69 @@ #define HEXDUMP_HEXSTUFF_PER_LINE \ (HEXDUMP_HEXSTUFF_PER_SHORT * HEXDUMP_SHORTS_PER_LINE) + +/* + * The is_utf8_printable() function below is taken from ngrep + * + * Check if a UTF-8 character sequence is printable using standard library functions. + * Returns the number of bytes in the UTF-8 character if printable, 0 otherwise.
+ * When the character is printable and width_out is not NULL, also stores its display width in *width_out (0 for combining characters, otherwise normally 1 or 2 columns); *width_out is left untouched when 0 is returned. + * + * This uses mbrtowc() to convert the multi-byte sequence to a wide char (decoding therefore follows the current LC_CTYPE locale, which the caller is expected to have set to UTF-8), then iswprint() + * to check if it's printable, and wcwidth() to get the display width. + */ +static u_int is_utf8_printable(const unsigned char *s, size_t max_len, int *width_out) { + if (!s || max_len == 0) return 0; + + mbstate_t state = {0}; + wchar_t wc; + + size_t len = mbrtowc(&wc, (const char *)s, max_len, &state); + + /* Check for errors and incomplete sequences */ + if (len == (size_t)-1) { + /* Encoding error */ + return 0; + } + + if (len == (size_t)-2) { + /* Incomplete multi-byte sequence (need more bytes) */ + return 0; + } + + if (len == 0) { + /* Null character */ + return 0; + } + + /* Check if the wide character is printable */ +#if defined(_WIN32) || defined(_WIN64) + /* Windows iswprint() is too conservative - be more permissive for UTF-8 */ + /* Accept any valid UTF-8 character that's not a control character */ + int is_printable = iswprint(wc) || + (wc >= 0x80 && wc < 0xD800) || /* Most of BMP except surrogates */ + (wc >= 0xE000 && wc < 0x110000); /* Private use + supplementary planes */ + + /* But exclude actual control characters */ + if (wc < 0x20 || (wc >= 0x7F && wc < 0xA0)) { + is_printable = 0; + } +#else + int is_printable = iswprint(wc); +#endif + + if (is_printable) { + /* Get display width (1 for normal chars, 2 for wide chars like CJK, 0 for combining) */ + int w = wcwidth(wc); + if (w < 0) w = 1; /* Treat non-printable/control as width 1 */ + /* Note: wcwidth returns 0 for combining characters, which is correct */ + if (width_out) *width_out = w; + return (u_int)len; + } + + return 0; +} + void ascii_print(netdissect_options *ndo, const u_char *cp, u_int length) @@ -71,28 +141,43 @@ ascii_print(netdissect_options *ndo, truncated = TRUE; } ND_PRINT("\n"); - while (length != 0) { - s = GET_U_1(cp); - cp++; - length--; - if (s == '\r') { - /* - * Don't print CRs at the end of the
line; they - * don't belong at the ends of lines on UN*X, - * and the standard I/O library will give us one - * on Windows so we don't need to print one - * ourselves. - * - * In the middle of a line, just print a '.'. - */ - if (length > 1 && GET_U_1(cp) != '\n') - ND_PRINT("."); + + while (length > 0) { + int utf8_len; + int j; + + utf8_len = ndo->ndo_utf8 ? is_utf8_printable(cp, length, NULL) : 0; + + if (utf8_len > 0) { + /* Valid printable UTF-8 character */ + for (j = 0; j < utf8_len; j++) + ND_PRINT("%c", cp[j]); + cp += utf8_len; + length -= utf8_len; + } else { - if (!ND_ASCII_ISGRAPH(s) && - (s != '\t' && s != ' ' && s != '\n')) - ND_PRINT("."); - else - ND_PRINT("%c", s); + s = GET_U_1(cp); + cp++; + length--; + if (s == '\r') { + /* + * Don't print CRs at the end of the line; they + * don't belong at the ends of lines on UN*X, + * and the standard I/O library will give us one + * on Windows so we don't need to print one + * ourselves. + * + * In the middle of a line, just print a '.'. 
+ */ + if (length > 1 && GET_U_1(cp) != '\n') + ND_PRINT("."); + } else { + if (!ND_ASCII_ISGRAPH(s) && + (s != '\t' && s != ' ' && s != '\n')) + ND_PRINT("."); + else + ND_PRINT("%c", s); + } } } if (truncated) @@ -104,52 +189,67 @@ hex_and_ascii_print_with_offset(netdissect_options *ndo, const char *indent, const u_char *cp, u_int length, u_int offset) { u_int caplength; - u_int i; - u_int s1, s2; - u_int nshorts; + u_int nbytes_unprinted; + u_int s1; int truncated = FALSE; char hexstuff[HEXDUMP_SHORTS_PER_LINE*HEXDUMP_HEXSTUFF_PER_SHORT+1], *hsp; - char asciistuff[ASCII_LINELENGTH+1], *asp; + char asciistuff[ASCII_LINELENGTH+1+4], *asp; + u_int utf8_bytes_to_skip = 0; caplength = ND_BYTES_AVAILABLE_AFTER(cp); if (length > caplength) { length = caplength; truncated = TRUE; } - nshorts = length / sizeof(u_short); - i = 0; + nbytes_unprinted = 0; hsp = hexstuff; asp = asciistuff; - while (nshorts != 0) { + while (length != 0) { s1 = GET_U_1(cp); + + // insert the leading space of short + if ((hsp - hexstuff) % HEXDUMP_HEXSTUFF_PER_SHORT == 0) { + (void)snprintf(hsp, sizeof(hexstuff) - (hsp - hexstuff), " "); + hsp++; + } + + // add the byte + (void)snprintf(hsp, sizeof(hexstuff) - (hsp - hexstuff), "%02x", s1); + hsp += 2; + + if (utf8_bytes_to_skip > 0) { + // only pad the new line + if (nbytes_unprinted == (u_int)(asp - asciistuff)) { + *(asp++) = ' '; + } + utf8_bytes_to_skip--; + } else { + // try to add the display (utf8) chars + utf8_bytes_to_skip = ndo->ndo_utf8 ? 
is_utf8_printable(cp, length, NULL) : 0; + if (utf8_bytes_to_skip > 0) { + u_int j; + for (j=0; j= HEXDUMP_SHORTS_PER_LINE) { + nbytes_unprinted++; + if (nbytes_unprinted >= (HEXDUMP_SHORTS_PER_LINE * sizeof(u_short))) { *hsp = *asp = '\0'; ND_PRINT("%s0x%04x: %-*s %s", indent, offset, HEXDUMP_HEXSTUFF_PER_LINE, hexstuff, asciistuff); - i = 0; hsp = hexstuff; asp = asciistuff; + nbytes_unprinted = 0; hsp = hexstuff; asp = asciistuff; offset += HEXDUMP_BYTES_PER_LINE; } - nshorts--; - } - if (length & 1) { - s1 = GET_U_1(cp); - cp++; - (void)snprintf(hsp, sizeof(hexstuff) - (hsp - hexstuff), - " %02x", s1); - hsp += 3; - *(asp++) = (char)(ND_ASCII_ISGRAPH(s1) ? s1 : '.'); - ++i; + length--; } - if (i > 0) { + + if (nbytes_unprinted > 0) { *hsp = *asp = '\0'; ND_PRINT("%s0x%04x: %-*s %s", indent, offset, HEXDUMP_HEXSTUFF_PER_LINE, @@ -159,6 +259,7 @@ hex_and_ascii_print_with_offset(netdissect_options *ndo, const char *indent, nd_trunc_longjmp(ndo); } + void hex_and_ascii_print(netdissect_options *ndo, const char *indent, const u_char *cp, u_int length) diff --git a/tcpdump.c b/tcpdump.c index bf2df874d..6dadde51b 100644 --- a/tcpdump.c +++ b/tcpdump.c @@ -103,6 +103,7 @@ The Regents of the University of California. 
All rights reserved.\n"; #include #include #include +#include #ifdef _WIN32 #include #else @@ -668,6 +669,7 @@ show_remote_devices_and_exit(void) #define OPTION_LENGTHS 138 #define OPTION_TIME_T_SIZE 139 #define OPTION_SKIP 140 +#define OPTION_UTF8 141 static const struct option longopts[] = { { "buffer-size", required_argument, NULL, 'B' }, @@ -712,6 +714,7 @@ static const struct option longopts[] = { { "time-t-size", no_argument, NULL, OPTION_TIME_T_SIZE }, { "ip-oneline", no_argument, NULL, 'g' }, { "skip", required_argument, NULL, OPTION_SKIP }, + { "utf8", no_argument, NULL, OPTION_UTF8 }, { "version", no_argument, NULL, OPTION_VERSION }, { NULL, 0, NULL, 0 } }; @@ -728,6 +731,8 @@ static const struct option longopts[] = { #define IMMEDIATE_MODE_USAGE "" #endif +#define DISPLAY_UTF8_USAGE "[ --utf8 ] " + #ifndef _WIN32 /* Drop root privileges and chroot if necessary */ static void @@ -1631,6 +1636,9 @@ main(int argc, char **argv) memset(ndo, 0, sizeof(*ndo)); ndo_set_function_pointers(ndo); + setlocale(LC_CTYPE, ""); + + cnt = -1; device = NULL; infile = NULL; @@ -2094,6 +2102,10 @@ main(int argc, char **argv) optarg, NULL, 0, INT_MAX, 0); break; + case OPTION_UTF8: + ++ndo->ndo_utf8; + break; + #ifdef HAVE_PCAP_SET_TSTAMP_PRECISION case OPTION_TSTAMP_MICRO: ndo->ndo_tstamp_precision = PCAP_TSTAMP_PRECISION_MICRO; @@ -2125,6 +2137,32 @@ main(int argc, char **argv) /* NOTREACHED */ } + + if (ndo->ndo_utf8) { + +#if defined(_WIN32) + /* On Windows, explicitly set UTF-8 locale and console code page for mbrtowc() to work */ + /* Try multiple locale formats for compatibility across Windows versions */ + if (setlocale(LC_CTYPE, ".UTF-8") == NULL) { + if (setlocale(LC_CTYPE, ".UTF8") == NULL) { + if (setlocale(LC_CTYPE, "en_US.UTF-8") == NULL) { + setlocale(LC_CTYPE, "C.UTF-8"); + } + } + } + /* Also set console code page to UTF-8 (65001) */ + SetConsoleOutputCP(65001); + SetConsoleCP(65001); +#else
+ /* setlocale(LC_CTYPE, "") at startup already took the locale from the environment; re-reading LANG alone would override LC_ALL/LC_CTYPE, so keep it and only fall back when it cannot decode UTF-8 (probe: the 3-byte encoding of U+20AC) */ + if (mblen("\xe2\x82\xac", 3) != 3 && + setlocale(LC_CTYPE, "C.UTF-8") == NULL && + setlocale(LC_CTYPE, "en_US.UTF-8") == NULL) + warning("--utf8: no UTF-8 locale is available"); +#endif + + } + if (ndo->ndo_Aflag && ndo->ndo_xflag) error("-A and -x[x] are mutually exclusive."); if (ndo->ndo_Aflag && ndo->ndo_Xflag) @@ -3507,5 +3545,5 @@ print_usage(FILE *f) "\t\t[ --time-stamp-precision precision ] [ --micro ] [ --nano ]\n"); #endif (void)fprintf(f, -"\t\t" z_FLAG_USAGE "[ -Z user ] [ expression ]\n"); +"\t\t" DISPLAY_UTF8_USAGE z_FLAG_USAGE "[ -Z user ] [ expression ]\n"); }