@@ -135,19 +135,67 @@ char _PyIO_get_console_type(PyObject *path_or_fd) {
135135}
136136
137137static DWORD
138- _find_last_utf8_boundary (const char * buf , DWORD len )
138+ _find_last_utf8_boundary (const unsigned char * buf , DWORD len )
139139{
140- /* This function never returns 0, returns the original len instead */
141- DWORD count = 1 ;
142- if (len == 0 || (buf [len - 1 ] & 0x80 ) == 0 ) {
143- return len ;
144- }
145- for (;; count ++ ) {
146- if (count > 3 || count >= len ) {
140+ for (DWORD count = 1 ; count < 4 && count <= len ; count ++ ) {
141+ unsigned char c = buf [len - count ];
142+ if (c < 0x80 ) {
143+ /* No starting byte found. */
147144 return len ;
148145 }
149- if ((buf [len - count ] & 0xc0 ) != 0x80 ) {
150- return len - count ;
146+ if (c >= 0xc0 ) {
147+ if (c < 0xe0 /* 2-bytes sequence */ ? count < 2 :
148+ c < 0xf0 /* 3-bytes sequence */ ? count < 3 :
149+ c < 0xf8 /* 4-bytes sequence */ )
150+ {
151+ /* Incomplete multibyte sequence. */
152+ return len - count ;
153+ }
154+ /* Either complete or invalid sequence. */
155+ return len ;
156+ }
157+ }
158+ /* Either complete 4-bytes sequence or invalid sequence. */
159+ return len ;
160+ }
161+
162+ /* Find the number of UTF-8 bytes that corresponds to the specified number of
163+ * wchars.
164+ * I.e. find x <= len so that MultiByteToWideChar(CP_UTF8, 0, s, x, NULL, 0) == n.
165+ *
166+ * WideCharToMultiByte() cannot be used for this, because the UTF-8 -> wchar
167+ * conversion is not reversible (invalid UTF-8 byte produces \ufffd which
168+ * will be converted back to 3-bytes UTF-8 sequence \xef\xbf\xbd).
169+ * So we need to use binary search.
170+ */
171+ static DWORD
172+ _wchar_to_utf8_count (const unsigned char * s , DWORD len , DWORD n )
173+ {
174+ DWORD start = 0 ;
175+ while (1 ) {
176+ DWORD mid = 0 ;
177+ for (DWORD i = len / 2 ; i <= len ; i ++ ) {
178+ mid = _find_last_utf8_boundary (s , i );
179+ if (mid != 0 ) {
180+ break ;
181+ }
182+ /* The middle could split the first multibytes sequence. */
183+ }
184+ if (mid == len ) {
185+ return start + len ;
186+ }
187+ if (mid == 0 ) {
188+ mid = len > 1 ? len - 1 : 1 ;
189+ }
190+ DWORD wlen = MultiByteToWideChar (CP_UTF8 , 0 , s , mid , NULL , 0 );
191+ if (wlen <= n ) {
192+ s += mid ;
193+ start += mid ;
194+ len -= mid ;
195+ n -= wlen ;
196+ }
197+ else {
198+ len = mid ;
151199 }
152200 }
153201}
@@ -563,8 +611,10 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
563611 int err = 0 , sig = 0 ;
564612
565613 wchar_t * buf = (wchar_t * )PyMem_Malloc (maxlen * sizeof (wchar_t ));
566- if (!buf )
614+ if (!buf ) {
615+ PyErr_NoMemory ();
567616 goto error ;
617+ }
568618
569619 * readlen = 0 ;
570620
@@ -622,6 +672,7 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
622672 Py_UNBLOCK_THREADS
623673 if (!newbuf ) {
624674 sig = -1 ;
675+ PyErr_NoMemory ();
625676 break ;
626677 }
627678 buf = newbuf ;
@@ -645,8 +696,10 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
645696 if (* readlen > 0 && buf [0 ] == L'\x1a' ) {
646697 PyMem_Free (buf );
647698 buf = (wchar_t * )PyMem_Malloc (sizeof (wchar_t ));
648- if (!buf )
699+ if (!buf ) {
700+ PyErr_NoMemory ();
649701 goto error ;
702+ }
650703 buf [0 ] = L'\0' ;
651704 * readlen = 0 ;
652705 }
@@ -824,8 +877,10 @@ _io__WindowsConsoleIO_readall_impl(winconsoleio *self)
824877 bufsize = BUFSIZ ;
825878
826879 buf = (wchar_t * )PyMem_Malloc ((bufsize + 1 ) * sizeof (wchar_t ));
827- if (buf == NULL )
880+ if (buf == NULL ) {
881+ PyErr_NoMemory ();
828882 return NULL ;
883+ }
829884
830885 while (1 ) {
831886 wchar_t * subbuf ;
@@ -847,6 +902,7 @@ _io__WindowsConsoleIO_readall_impl(winconsoleio *self)
847902 (bufsize + 1 ) * sizeof (wchar_t ));
848903 if (tmp == NULL ) {
849904 PyMem_Free (buf );
905+ PyErr_NoMemory ();
850906 return NULL ;
851907 }
852908 buf = tmp ;
@@ -1022,43 +1078,49 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
10221078 len = (DWORD )b -> len ;
10231079
10241080 Py_BEGIN_ALLOW_THREADS
1025- wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len , NULL , 0 );
1026-
10271081 /* issue11395 there is an unspecified upper bound on how many bytes
10281082 can be written at once. We cap at 32k - the caller will have to
10291083 handle partial writes.
10301084 Since we don't know how many input bytes are being ignored, we
10311085 have to reduce and recalculate. */
1032- while (wlen > 32766 / sizeof (wchar_t )) {
1033- len /= 2 ;
1086+ const DWORD max_wlen = 32766 / sizeof (wchar_t );
1087+ /* UTF-8 to wchar ratio is at most 3:1. */
1088+ len = Py_MIN (len , max_wlen * 3 );
1089+ while (1 ) {
10341090 /* Fix for github issues gh-110913 and gh-82052. */
10351091 len = _find_last_utf8_boundary (b -> buf , len );
10361092 wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len , NULL , 0 );
1093+ if (wlen <= max_wlen ) {
1094+ break ;
1095+ }
1096+ len /= 2 ;
10371097 }
10381098 Py_END_ALLOW_THREADS
10391099
1040- if (!wlen )
1041- return PyErr_SetFromWindowsErr (0 );
1100+ if (!wlen ) {
1101+ return PyLong_FromLong (0 );
1102+ }
10421103
10431104 wbuf = (wchar_t * )PyMem_Malloc (wlen * sizeof (wchar_t ));
1105+ if (!wbuf ) {
1106+ PyErr_NoMemory ();
1107+ return NULL ;
1108+ }
10441109
10451110 Py_BEGIN_ALLOW_THREADS
10461111 wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len , wbuf , wlen );
10471112 if (wlen ) {
10481113 res = WriteConsoleW (handle , wbuf , wlen , & n , NULL );
1114+ #ifdef Py_DEBUG
1115+ if (res ) {
1116+ #else
10491117 if (res && n < wlen ) {
1118+ #endif
10501119 /* Wrote fewer characters than expected, which means our
10511120 * len value may be wrong. So recalculate it from the
1052- * characters that were written. As this could potentially
1053- * result in a different value, we also validate that value.
1121+ * characters that were written.
10541122 */
1055- len = WideCharToMultiByte (CP_UTF8 , 0 , wbuf , n ,
1056- NULL , 0 , NULL , NULL );
1057- if (len ) {
1058- wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len ,
1059- NULL , 0 );
1060- assert (wlen == len );
1061- }
1123+ len = _wchar_to_utf8_count (b -> buf , len , n );
10621124 }
10631125 } else
10641126 res = 0 ;
0 commit comments