-
Notifications
You must be signed in to change notification settings - Fork 804
Fix potential unicode conversion issues for *nix #7506
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
tex3d
wants to merge
3
commits into
microsoft:main
Choose a base branch
from
tex3d:unicode-fixes
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,7 +28,9 @@ int MultiByteToWideChar(uint32_t /*CodePage*/, uint32_t /*dwFlags*/, | |
const char *lpMultiByteStr, int cbMultiByte, | ||
wchar_t *lpWideCharStr, int cchWideChar) { | ||
|
||
if (cbMultiByte == 0) { | ||
// Check for invalid sizes or potential overflow. | ||
if (cbMultiByte == 0 || cbMultiByte < -1 || cbMultiByte > (INT32_MAX - 1) || | ||
cchWideChar < 0 || cchWideChar > (INT32_MAX - 1)) { | ||
SetLastError(ERROR_INVALID_PARAMETER); | ||
return 0; | ||
} | ||
|
@@ -42,18 +44,17 @@ int MultiByteToWideChar(uint32_t /*CodePage*/, uint32_t /*dwFlags*/, | |
++cbMultiByte; | ||
} | ||
// If zero is given as the destination size, this function should | ||
// return the required size (including the null-terminating character). | ||
// return the required size (including or excluding the null-terminating | ||
// character depending on whether the input included the null-terminator). | ||
// This is the behavior of mbstowcs when the target is null. | ||
if (cchWideChar == 0) { | ||
lpWideCharStr = nullptr; | ||
} else if (cchWideChar < cbMultiByte) { | ||
SetLastError(ERROR_INSUFFICIENT_BUFFER); | ||
return 0; | ||
} | ||
|
||
ScopedLocale utf8_locale_scope(CP_UTF8); | ||
|
||
bool isNullTerminated = false; | ||
size_t rv; | ||
const char *prevLocale = setlocale(LC_ALL, nullptr); | ||
setlocale(LC_ALL, "en_US.UTF-8"); | ||
if (lpMultiByteStr[cbMultiByte - 1] != '\0') { | ||
char *srcStr = (char *)malloc((cbMultiByte + 1) * sizeof(char)); | ||
strncpy(srcStr, lpMultiByteStr, cbMultiByte); | ||
|
@@ -62,14 +63,29 @@ int MultiByteToWideChar(uint32_t /*CodePage*/, uint32_t /*dwFlags*/, | |
free(srcStr); | ||
} else { | ||
rv = mbstowcs(lpWideCharStr, lpMultiByteStr, cchWideChar); | ||
isNullTerminated = true; | ||
} | ||
|
||
if (rv == ~(size_t)0) { | ||
// mbstowcs returns -1 on error. | ||
SetLastError(ERROR_INVALID_PARAMETER); | ||
return 0; | ||
} | ||
|
||
if (prevLocale) | ||
setlocale(LC_ALL, prevLocale); | ||
// Return value of mbstowcs (rv) excludes the terminating character. | ||
// Matching MultiByteToWideChar requires returning the size written including | ||
// the null terminator if the input was null-terminated, otherwise it | ||
// returns the size written excluding the null terminator. | ||
if (isNullTerminated) | ||
rv += 1; | ||
|
||
// Check for overflow when returning the size. | ||
if (rv >= INT32_MAX) { | ||
SetLastError(ERROR_INVALID_PARAMETER); | ||
return 0; // Overflow error | ||
} | ||
|
||
if (rv == (size_t)cbMultiByte) | ||
return rv; | ||
return rv + 1; // mbstowcs excludes the terminating character | ||
return rv; | ||
} | ||
|
||
// WideCharToMultiByte is a Windows-specific method. | ||
|
@@ -84,7 +100,9 @@ int WideCharToMultiByte(uint32_t /*CodePage*/, uint32_t /*dwFlags*/, | |
*lpUsedDefaultChar = FALSE; | ||
} | ||
|
||
if (cchWideChar == 0) { | ||
// Check for invalid sizes or potential overflow. | ||
if (cchWideChar == 0 || cchWideChar < -1 || cchWideChar > (INT32_MAX - 1) || | ||
cbMultiByte < 0 || cbMultiByte > (INT32_MAX - 1)) { | ||
SetLastError(ERROR_INVALID_PARAMETER); | ||
return 0; | ||
} | ||
|
@@ -98,18 +116,17 @@ int WideCharToMultiByte(uint32_t /*CodePage*/, uint32_t /*dwFlags*/, | |
++cchWideChar; | ||
} | ||
// If zero is given as the destination size, this function should | ||
// return the required size (including the null-terminating character). | ||
// return the required size (including or excluding the null-terminating | ||
// character depending on whether the input included the null-terminator). | ||
// This is the behavior of wcstombs when the target is null. | ||
if (cbMultiByte == 0) { | ||
lpMultiByteStr = nullptr; | ||
} else if (cbMultiByte < cchWideChar) { | ||
SetLastError(ERROR_INSUFFICIENT_BUFFER); | ||
return 0; | ||
} | ||
|
||
ScopedLocale utf8_locale_scope(CP_UTF8); | ||
|
||
bool isNullTerminated = false; | ||
size_t rv; | ||
const char *prevLocale = setlocale(LC_ALL, nullptr); | ||
setlocale(LC_ALL, "en_US.UTF-8"); | ||
if (lpWideCharStr[cchWideChar - 1] != L'\0') { | ||
wchar_t *srcStr = (wchar_t *)malloc((cchWideChar + 1) * sizeof(wchar_t)); | ||
wcsncpy(srcStr, lpWideCharStr, cchWideChar); | ||
|
@@ -118,21 +135,40 @@ int WideCharToMultiByte(uint32_t /*CodePage*/, uint32_t /*dwFlags*/, | |
free(srcStr); | ||
} else { | ||
rv = wcstombs(lpMultiByteStr, lpWideCharStr, cbMultiByte); | ||
isNullTerminated = true; | ||
} | ||
|
||
if (rv == ~(size_t)0) { | ||
// wcstombs returns -1 on error. | ||
SetLastError(ERROR_INVALID_PARAMETER); | ||
return 0; | ||
} | ||
|
||
if (prevLocale) | ||
setlocale(LC_ALL, prevLocale); | ||
// Return value of wcstombs (rv) excludes the terminating character. | ||
// Matching MultiByteToWideChar requires returning the size written including | ||
// the null terminator if the input was null-terminated, otherwise it | ||
// returns the size written excluding the null terminator. | ||
if (isNullTerminated) | ||
rv += 1; | ||
|
||
// Check for overflow when returning the size. | ||
if (rv >= INT32_MAX) { | ||
SetLastError(ERROR_INVALID_PARAMETER); | ||
return 0; // Overflow error | ||
} | ||
|
||
if (rv == (size_t)cchWideChar) | ||
return rv; | ||
return rv + 1; // mbstowcs excludes the terminating character | ||
return rv; | ||
} | ||
#endif // _WIN32 | ||
|
||
namespace Unicode { | ||
|
||
bool WideToEncodedString(const wchar_t *text, size_t cWide, DWORD cp, | ||
DWORD flags, std::string *pValue, bool *lossy) { | ||
DXASSERT_NOMSG(cWide == ~(size_t)0 || cWide < INT32_MAX); | ||
if (text == nullptr || pValue == nullptr || cWide == 0 || cWide >= INT32_MAX) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since |
||
return false; | ||
|
||
BOOL usedDefaultChar; | ||
LPBOOL pUsedDefaultChar = (lossy == nullptr) ? nullptr : &usedDefaultChar; | ||
if (lossy != nullptr) | ||
|
@@ -147,31 +183,37 @@ bool WideToEncodedString(const wchar_t *text, size_t cWide, DWORD cp, | |
return true; | ||
} | ||
|
||
int cbUTF8 = ::WideCharToMultiByte(cp, flags, text, cWide, nullptr, 0, | ||
nullptr, pUsedDefaultChar); | ||
int cbUTF8 = ::WideCharToMultiByte(cp, flags, text, static_cast<int>(cWide), | ||
nullptr, 0, nullptr, pUsedDefaultChar); | ||
if (cbUTF8 == 0) | ||
return false; | ||
|
||
pValue->resize(cbUTF8); | ||
|
||
cbUTF8 = ::WideCharToMultiByte(cp, flags, text, cWide, &(*pValue)[0], | ||
pValue->size(), nullptr, pUsedDefaultChar); | ||
cbUTF8 = ::WideCharToMultiByte(cp, flags, text, static_cast<int>(cWide), | ||
&(*pValue)[0], pValue->size(), nullptr, | ||
pUsedDefaultChar); | ||
DXASSERT(cbUTF8 > 0, "otherwise contents have changed"); | ||
DXASSERT((*pValue)[pValue->size()] == '\0', | ||
"otherwise string didn't null-terminate after resize() call"); | ||
if ((cWide == ~(size_t)0 || text[cWide - 1] == L'\0') && | ||
(*pValue)[pValue->size() - 1] == '\0') { | ||
// When the input is null-terminated, the output includes the null | ||
// terminator. Reduce the size by 1 to remove the embedded null terminator | ||
// inside the string. | ||
pValue->resize(cbUTF8 - 1); | ||
} | ||
|
||
if (lossy != nullptr) | ||
*lossy = usedDefaultChar; | ||
return true; | ||
} | ||
|
||
bool UTF8ToWideString(const char *pUTF8, std::wstring *pWide) { | ||
size_t cbUTF8 = (pUTF8 == nullptr) ? 0 : strlen(pUTF8); | ||
return UTF8ToWideString(pUTF8, cbUTF8, pWide); | ||
return UTF8ToWideString(pUTF8, -1, pWide); | ||
} | ||
|
||
bool UTF8ToWideString(const char *pUTF8, size_t cbUTF8, std::wstring *pWide) { | ||
DXASSERT_NOMSG(pWide != nullptr); | ||
DXASSERT_NOMSG(cbUTF8 == ~(size_t)0 || cbUTF8 < INT32_MAX); | ||
|
||
// Handle zero-length as a special case; it's a special value to indicate | ||
// errors in MultiByteToWideChar. | ||
|
@@ -181,17 +223,23 @@ bool UTF8ToWideString(const char *pUTF8, size_t cbUTF8, std::wstring *pWide) { | |
} | ||
|
||
int cWide = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8, | ||
cbUTF8, nullptr, 0); | ||
static_cast<int>(cbUTF8), nullptr, 0); | ||
if (cWide == 0) | ||
return false; | ||
|
||
pWide->resize(cWide); | ||
|
||
cWide = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8, cbUTF8, | ||
&(*pWide)[0], pWide->size()); | ||
cWide = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8, | ||
static_cast<int>(cbUTF8), &(*pWide)[0], | ||
pWide->size()); | ||
DXASSERT(cWide > 0, "otherwise contents changed"); | ||
DXASSERT((*pWide)[pWide->size()] == L'\0', | ||
"otherwise wstring didn't null-terminate after resize() call"); | ||
if ((cbUTF8 == ~(size_t)0 || pUTF8[cbUTF8 - 1] == '\0') && | ||
(*pWide)[pWide->size() - 1] == '\0') { | ||
// When the input is null-terminated, the output includes the null | ||
// terminator. Reduce the size by 1 to remove the embedded null terminator | ||
// inside the string. | ||
pWide->resize(cWide - 1); | ||
} | ||
return true; | ||
} | ||
|
||
|
@@ -213,11 +261,12 @@ bool UTF8ToConsoleString(const char *text, size_t textLen, std::string *pValue, | |
if (!UTF8ToWideString(text, textLen, &text16)) { | ||
return false; | ||
} | ||
return WideToConsoleString(text16.c_str(), text16.length(), pValue, lossy); | ||
return WideToConsoleString(text16.c_str(), text16.length() + 1, pValue, | ||
lossy); | ||
} | ||
|
||
bool UTF8ToConsoleString(const char *text, std::string *pValue, bool *lossy) { | ||
return UTF8ToConsoleString(text, strlen(text), pValue, lossy); | ||
return UTF8ToConsoleString(text, ~(size_t)0, pValue, lossy); | ||
} | ||
|
||
bool WideToConsoleString(const wchar_t *text, size_t textLen, | ||
|
@@ -230,7 +279,7 @@ bool WideToConsoleString(const wchar_t *text, size_t textLen, | |
|
||
bool WideToConsoleString(const wchar_t *text, std::string *pValue, | ||
bool *lossy) { | ||
return WideToConsoleString(text, wcslen(text), pValue, lossy); | ||
return WideToConsoleString(text, ~(size_t)0, pValue, lossy); | ||
} | ||
|
||
bool WideToUTF8String(const wchar_t *pWide, size_t cWide, std::string *pUTF8) { | ||
|
@@ -242,7 +291,7 @@ bool WideToUTF8String(const wchar_t *pWide, size_t cWide, std::string *pUTF8) { | |
bool WideToUTF8String(const wchar_t *pWide, std::string *pUTF8) { | ||
DXASSERT_NOMSG(pWide != nullptr); | ||
DXASSERT_NOMSG(pUTF8 != nullptr); | ||
return WideToEncodedString(pWide, wcslen(pWide), CP_UTF8, 0, pUTF8, nullptr); | ||
return WideToEncodedString(pWide, ~(size_t)0, CP_UTF8, 0, pUTF8, nullptr); | ||
} | ||
|
||
std::string WideToUTF8StringOrThrow(const wchar_t *pWide) { | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Isn't `cbMultiByte > (INT32_MAX - 1)` equivalent to `cbMultiByte == INT32_MAX`? Unless we were to change `cbMultiByte` and `cchWideChar` to larger types, I think the equality check is clearer, no?