Skip to content

Commit 641830c

Browse files
committed
Merge branch 'sk/mingw-uni-console'
* sk/mingw-uni-console:
  Win32: reliably detect console pipe handles
  Win32: fix broken pipe detection
  Win32: Thread-safe windows console output
  Win32: add Unicode conversion functions
  Win32: warn if the console font doesn't support Unicode
  Win32: detect console streams more reliably
  Win32: support Unicode console output
2 parents ba655d1 + 5182265 commit 641830c

File tree

3 files changed

+533
-123
lines changed

3 files changed

+533
-123
lines changed

compat/mingw.c

Lines changed: 91 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -865,9 +865,9 @@ static pid_t mingw_spawnve_fd(const char *cmd, const char **argv, char **env,
865865
memset(&si, 0, sizeof(si));
866866
si.cb = sizeof(si);
867867
si.dwFlags = STARTF_USESTDHANDLES;
868-
si.hStdInput = (HANDLE) _get_osfhandle(fhin);
869-
si.hStdOutput = (HANDLE) _get_osfhandle(fhout);
870-
si.hStdError = (HANDLE) _get_osfhandle(fherr);
868+
si.hStdInput = winansi_get_osfhandle(fhin);
869+
si.hStdOutput = winansi_get_osfhandle(fhout);
870+
si.hStdError = winansi_get_osfhandle(fherr);
871871

872872
/* concatenate argv, quoting args as we go */
873873
strbuf_init(&args, 0);
@@ -1848,6 +1848,91 @@ int mingw_offset_1st_component(const char *path)
18481848
return offset + is_dir_sep(path[offset]);
18491849
}
18501850

1851+
/*
 * Convert a UTF-8 byte string to UTF-16 wide chars.
 *
 * wcs:    target buffer
 * utfs:   bytes to convert
 * wcslen: capacity of wcs in wide chars, including the terminating 0
 * utflen: number of input bytes, or negative if utfs is 0-terminated
 *
 * Well-formed 1- to 4-byte sequences are decoded (4-byte sequences become a
 * surrogate pair).  Any other byte >= 0xa0 is copied 1:1, and any other byte
 * in 0x80-0x9f is written as two lowercase hex digits.
 *
 * Returns the number of wide chars written (excluding the terminator), or -1
 * with errno set to EINVAL (NULL pointer / zero-sized buffer) or ERANGE
 * (output does not fit).  A two-word result (surrogate pair or hex pair) is
 * never split: if only one slot is left the call fails with ERANGE.
 */
int xutftowcsn(wchar_t *wcs, const char *utfs, size_t wcslen, int utflen)
{
	int upos = 0, wpos = 0;
	const unsigned char *utf = (const unsigned char*) utfs;
	if (!utf || !wcs || wcslen < 1) {
		errno = EINVAL;
		return -1;
	}
	/* reserve space for \0 */
	wcslen--;
	if (utflen < 0)
		utflen = INT_MAX;

	while (upos < utflen) {
		int c = utf[upos++] & 0xff;
		if (utflen == INT_MAX && c == 0)
			break;

		if ((size_t)wpos >= wcslen)
			goto too_small;

		if (c < 0x80) {
			/* ASCII */
			wcs[wpos++] = c;
		} else if (c >= 0xc2 && c < 0xe0 && upos < utflen &&
				(utf[upos] & 0xc0) == 0x80) {
			/* 2-byte utf-8 */
			c = ((c & 0x1f) << 6);
			c |= (utf[upos++] & 0x3f);
			wcs[wpos++] = c;
		} else if (c >= 0xe0 && c < 0xf0 && upos + 1 < utflen &&
				!(c == 0xe0 && utf[upos] < 0xa0) && /* over-long encoding */
				(utf[upos] & 0xc0) == 0x80 &&
				(utf[upos + 1] & 0xc0) == 0x80) {
			/* 3-byte utf-8 */
			c = ((c & 0x0f) << 12);
			c |= ((utf[upos++] & 0x3f) << 6);
			c |= (utf[upos++] & 0x3f);
			wcs[wpos++] = c;
		} else if (c >= 0xf0 && c < 0xf5 && upos + 2 < utflen &&
				!(c == 0xf0 && utf[upos] < 0x90) && /* over-long encoding */
				!(c == 0xf4 && utf[upos] >= 0x90) && /* > \u10ffff */
				(utf[upos] & 0xc0) == 0x80 &&
				(utf[upos + 1] & 0xc0) == 0x80 &&
				(utf[upos + 2] & 0xc0) == 0x80) {
			/*
			 * 4-byte utf-8: convert to \ud8xx \udcxx surrogate pair.
			 * A valid sequence that lacks room for both halves is a
			 * buffer-size problem, not invalid input.
			 */
			if ((size_t)wpos + 1 >= wcslen)
				goto too_small;
			c = ((c & 0x07) << 18);
			c |= ((utf[upos++] & 0x3f) << 12);
			c |= ((utf[upos++] & 0x3f) << 6);
			c |= (utf[upos++] & 0x3f);
			c -= 0x10000;
			wcs[wpos++] = 0xd800 | (c >> 10);
			wcs[wpos++] = 0xdc00 | (c & 0x3ff);
		} else if (c >= 0xa0) {
			/* invalid utf-8 byte, printable unicode char: convert 1:1 */
			wcs[wpos++] = c;
		} else {
			/* invalid utf-8 byte, non-printable unicode: convert to hex */
			static const char hex[] = "0123456789abcdef";
			/* never emit half a hex pair: that would silently truncate */
			if ((size_t)wpos + 1 >= wcslen)
				goto too_small;
			wcs[wpos++] = hex[c >> 4];
			wcs[wpos++] = hex[c & 0x0f];
		}
	}
	wcs[wpos] = 0;
	return wpos;

too_small:
	/* wpos <= wcslen here, so the terminator still fits in the buffer */
	wcs[wpos] = 0;
	errno = ERANGE;
	return -1;
}
1922+
1923+
int xwcstoutf(char *utf, const wchar_t *wcs, size_t utflen)
1924+
{
1925+
if (!wcs || !utf || utflen < 1) {
1926+
errno = EINVAL;
1927+
return -1;
1928+
}
1929+
utflen = WideCharToMultiByte(CP_UTF8, 0, wcs, -1, utf, utflen, NULL, NULL);
1930+
if (utflen)
1931+
return utflen - 1;
1932+
errno = ERANGE;
1933+
return -1;
1934+
}
1935+
18511936
void mingw_startup()
18521937
{
18531938
/* copy executable name to argv[0] */
@@ -1861,4 +1946,7 @@ void mingw_startup()
18611946
_setmode(_fileno(stdin), _O_BINARY);
18621947
_setmode(_fileno(stdout), _O_BINARY);
18631948
_setmode(_fileno(stderr), _O_BINARY);
1949+
1950+
/* initialize Unicode console */
1951+
winansi_init();
18641952
}

compat/mingw.h

Lines changed: 106 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -317,12 +317,8 @@ int mingw_raise(int sig);
317317
* ANSI emulation wrappers
318318
*/
319319

320-
int winansi_fputs(const char *str, FILE *stream);
321-
int winansi_printf(const char *format, ...) __attribute__((format (printf, 1, 2)));
322-
int winansi_fprintf(FILE *stream, const char *format, ...) __attribute__((format (printf, 2, 3)));
323-
#define fputs winansi_fputs
324-
#define printf(...) winansi_printf(__VA_ARGS__)
325-
#define fprintf(...) winansi_fprintf(__VA_ARGS__)
320+
void winansi_init(void);
321+
HANDLE winansi_get_osfhandle(int fd);
326322

327323
/*
328324
* git specific compatibility
@@ -355,6 +351,110 @@ void mingw_open_html(const char *path);
355351
char **make_augmented_environ(const char *const *vars);
356352
void free_environ(char **env);
357353

354+
/**
355+
* Converts UTF-8 encoded string to UTF-16LE.
356+
*
357+
* To support repositories with legacy-encoded file names, invalid UTF-8 bytes
358+
* 0xa0 - 0xff are converted to corresponding printable Unicode chars \u00a0 -
359+
* \u00ff, and invalid UTF-8 bytes 0x80 - 0x9f (which would make non-printable
360+
* Unicode) are converted to hex-code.
361+
*
362+
* Lead-bytes not followed by an appropriate number of trail-bytes, over-long
363+
* encodings and 4-byte encodings > \u10ffff are detected as invalid UTF-8.
364+
*
365+
* Maximum space requirement for the target buffer is two wide chars per UTF-8
366+
* char (((strlen(utf) * 2) + 1) [* sizeof(wchar_t)]).
367+
*
368+
* The maximum space is needed only if the entire input string consists of
369+
* invalid UTF-8 bytes in range 0x80-0x9f, as per the following table:
370+
*
371+
* | | UTF-8 | UTF-16 |
372+
* Code point | UTF-8 sequence | bytes | words | ratio
373+
* --------------+-------------------+-------+--------+-------
374+
* 000000-00007f | 0-7f | 1 | 1 | 1
375+
* 000080-0007ff | c2-df + 80-bf | 2 | 1 | 0.5
376+
* 000800-00ffff | e0-ef + 2 * 80-bf | 3 | 1 | 0.33
377+
* 010000-10ffff | f0-f4 + 3 * 80-bf | 4 | 2 (a) | 0.5
378+
* invalid | 80-9f | 1 | 2 (b) | 2
379+
* invalid | a0-ff | 1 | 1 | 1
380+
*
381+
* (a) encoded as UTF-16 surrogate pair
382+
* (b) encoded as two hex digits
383+
*
384+
* Note that, while the UTF-8 encoding scheme can be extended to 5-byte, 6-byte
385+
* or even indefinite-byte sequences, the largest valid code point \u10ffff
386+
* encodes as only 4 UTF-8 bytes.
387+
*
388+
* Parameters:
389+
* wcs: wide char target buffer
390+
* utf: string to convert
391+
* wcslen: size of target buffer (in wchar_t's)
392+
* utflen: size of string to convert, or -1 if 0-terminated
393+
*
394+
* Returns:
395+
* length of converted string (_wcslen(wcs)), or -1 on failure
396+
*
397+
* Errors:
398+
* EINVAL: one of the input parameters is invalid (e.g. NULL)
399+
* ERANGE: the output buffer is too small
400+
*/
401+
int xutftowcsn(wchar_t *wcs, const char *utf, size_t wcslen, int utflen);
402+
403+
/**
 * Convenience wrapper around xutftowcsn for \0-terminated input: passes -1
 * as the input length.
 */
static inline int xutftowcs(wchar_t *wcs, const char *utf, size_t wcslen)
{
	const int nul_terminated = -1;

	return xutftowcsn(wcs, utf, wcslen, nul_terminated);
}
410+
411+
/**
412+
* Simplified file system specific variant of xutftowcsn, assumes output
413+
* buffer size is MAX_PATH wide chars and input string is \0-terminated,
414+
* fails with ENAMETOOLONG if input string is too long.
415+
*/
416+
static inline int xutftowcs_path(wchar_t *wcs, const char *utf)
417+
{
418+
int result = xutftowcsn(wcs, utf, MAX_PATH, -1);
419+
if (result < 0 && errno == ERANGE)
420+
errno = ENAMETOOLONG;
421+
return result;
422+
}
423+
424+
/**
425+
* Converts UTF-16LE encoded string to UTF-8.
426+
*
427+
* Maximum space requirement for the target buffer is three UTF-8 chars per
428+
* wide char ((_wcslen(wcs) * 3) + 1).
429+
*
430+
* The maximum space is needed only if the entire input string consists of
431+
* UTF-16 words in range 0x0800-0xd7ff or 0xe000-0xffff (i.e. \u0800-\uffff
432+
* modulo surrogate pairs), as per the following table:
433+
*
434+
* | | UTF-16 | UTF-8 |
435+
* Code point | UTF-16 sequence | words | bytes | ratio
436+
* --------------+-----------------------+--------+-------+-------
437+
* 000000-00007f | 0000-007f | 1 | 1 | 1
438+
* 000080-0007ff | 0080-07ff | 1 | 2 | 2
439+
* 000800-00ffff | 0800-d7ff / e000-ffff | 1 | 3 | 3
440+
* 010000-10ffff | d800-dbff + dc00-dfff | 2 | 4 | 2
441+
*
442+
* Note that invalid code points > 10ffff cannot be represented in UTF-16.
443+
*
444+
* Parameters:
445+
* utf: target buffer
446+
* wcs: wide string to convert
447+
* utflen: size of target buffer
448+
*
449+
* Returns:
450+
* length of converted string, or -1 on failure
451+
*
452+
* Errors:
453+
* EINVAL: one of the input parameters is invalid (e.g. NULL)
454+
* ERANGE: the output buffer is too small
455+
*/
456+
int xwcstoutf(char *utf, const wchar_t *wcs, size_t utflen);
457+
358458
/*
359459
* A critical section used in the implementation of the spawn
360460
* functions (mingw_spawnv[p]e()) and waitpid(). Initialised in

0 commit comments

Comments
 (0)