-
-
Notifications
You must be signed in to change notification settings - Fork 30
Open
Description
In __utf16_with::skip_input_error:
if (__it == __last) {
break;
}
auto __second_surrogate = __it;
++__second_surrogate;
if (__ztd_idk_detail_is_trail_surrogate(
static_cast<ztd_char32_t>(*__second_surrogate))) {
break;
}
__it = ::std::move(__second_surrogate);That *__second_surrogate reads past end.
AI-generated repro:
/**
* Reproducer for ztd::text UTF-16 bugs.
*
* Bug 1: In utf16.hpp's skip_input_error function, when handling an unpaired surrogate at the end
* of input, the code dereferences __second_surrogate without checking if it has reached __last.
*
* Bug 2: In utf16.hpp's decode_one function, the bounds check for incomplete surrogate pairs
* happens BEFORE advancing the iterator, so it doesn't catch the case where a lead surrogate
* is the last character in the input.
*
* To reproduce: run this test WITHOUT the fixes applied to utf16.hpp.
*/
#include <cassert>
#include <cstddef>
#include <iterator>
#include <string>
#include <gtest/gtest.h>
#include <ztd/text.hpp>
/**
* A string_view-like class with checked iterators that assert on past-end dereference.
* This is used to detect when ztd::text reads past the end of input.
*/
template<class CharT>
class AssertingStringView {
public:
using value_type = CharT;
using size_type = std::size_t;
using difference_type = std::ptrdiff_t;
using pointer = const CharT *;
using const_pointer = const CharT *;
using reference = const CharT &;
using const_reference = const CharT &;
class iterator {
public:
using iterator_category = std::contiguous_iterator_tag;
using value_type = CharT;
using difference_type = std::ptrdiff_t;
using pointer = const CharT *;
using reference = const CharT &;
iterator() = default;
iterator(const CharT *ptr, const CharT *end) : _ptr(ptr), _end(end) {}
reference operator*() const {
assert(_ptr < _end && "AssertingStringView: cannot dereference end iterator");
return *_ptr;
}
pointer operator->() const {
assert(_ptr < _end && "AssertingStringView: cannot dereference end iterator");
return _ptr;
}
iterator &operator++() { ++_ptr; return *this; }
iterator operator++(int) { iterator tmp = *this; ++_ptr; return tmp; }
iterator &operator--() { --_ptr; return *this; }
iterator operator--(int) { iterator tmp = *this; --_ptr; return tmp; }
iterator &operator+=(difference_type n) { _ptr += n; return *this; }
iterator &operator-=(difference_type n) { _ptr -= n; return *this; }
iterator operator+(difference_type n) const { return iterator(_ptr + n, _end); }
iterator operator-(difference_type n) const { return iterator(_ptr - n, _end); }
difference_type operator-(const iterator &other) const { return _ptr - other._ptr; }
reference operator[](difference_type n) const {
assert(_ptr + n < _end && "AssertingStringView: index out of bounds");
return _ptr[n];
}
bool operator==(const iterator &other) const { return _ptr == other._ptr; }
bool operator!=(const iterator &other) const { return _ptr != other._ptr; }
bool operator<(const iterator &other) const { return _ptr < other._ptr; }
bool operator<=(const iterator &other) const { return _ptr <= other._ptr; }
bool operator>(const iterator &other) const { return _ptr > other._ptr; }
bool operator>=(const iterator &other) const { return _ptr >= other._ptr; }
friend iterator operator+(difference_type n, const iterator &it) { return it + n; }
private:
const CharT *_ptr = nullptr;
const CharT *_end = nullptr;
};
using const_iterator = iterator;
AssertingStringView() = default;
AssertingStringView(const CharT *data, size_type size) : _data(data), _size(size) {}
const_iterator begin() const { return iterator(_data, _data + _size); }
const_iterator end() const { return iterator(_data + _size, _data + _size); }
const_iterator cbegin() const { return begin(); }
const_iterator cend() const { return end(); }
size_type size() const { return _size; }
bool empty() const { return _size == 0; }
const_pointer data() const { return _data; }
reference operator[](size_type pos) const {
assert(pos < _size && "AssertingStringView: index out of bounds");
return _data[pos];
}
private:
const CharT *_data = nullptr;
size_type _size = 0;
};
/**
* Bug reproducer: Unpaired lead surrogate at end of input causes past-end dereference.
*
* Input: L"\xDC00\x0028" (unpaired low surrogate followed by '(')
* Expected: Should transcode to UTF-8 with replacement character for the unpaired surrogate.
* Actual (before fix): Asserts "cannot dereference end iterator" because skip_input_error
* tries to check if the next code unit is a trail surrogate without
* first checking if we've reached the end of input.
*/
TEST(ZtdTextBug, UnpairedSurrogateAtEnd) {
// L"\xDC00" is an unpaired low surrogate (invalid UTF-16).
// L"\x0028" is '(' (valid).
// When transcoding, the replacement_handler is invoked for the invalid surrogate,
// which calls skip_input_error. The bug is in skip_input_error's surrogate pair detection.
const wchar_t data[] = L"\xDC00\x0028";
AssertingStringView<wchar_t> input(data, 2);
// This should not assert - it should produce replacement character + '('
std::string result = ztd::text::transcode(input, ztd::text::wide_utf16, ztd::text::compat_utf8,
ztd::text::replacement_handler);
// U+FFFD in UTF-8 is 0xEF 0xBF 0xBD, followed by '(' (0x28)
EXPECT_EQ(result, "\xef\xbf\xbd(");
}
/**
* Additional test: Lead surrogate at end of input (incomplete surrogate pair).
*/
TEST(ZtdTextBug, LeadSurrogateAtEnd) {
// L"\xD800" is a lead surrogate without a trailing surrogate.
const wchar_t data[] = L"\xD800";
AssertingStringView<wchar_t> input(data, 1);
// This should produce a replacement character, not crash.
std::string result = ztd::text::transcode(input, ztd::text::wide_utf16, ztd::text::compat_utf8,
ztd::text::replacement_handler);
// U+FFFD in UTF-8 is 0xEF 0xBF 0xBD
EXPECT_EQ(result, "\xef\xbf\xbd");
}Metadata
Metadata
Assignees
Labels
No labels