Skip to content

Reading past input end #47

@captainurist

Description

@captainurist

In __utf16_with::skip_input_error:

	if (__it == __last) {
		break;
	}
	auto __second_surrogate = __it;
	++__second_surrogate;
	if (__ztd_idk_detail_is_trail_surrogate(
		    static_cast<ztd_char32_t>(*__second_surrogate))) {
		break;
	}
	__it = ::std::move(__second_surrogate);

That *__second_surrogate reads past end.

AI-generated repro:

/**
 * Reproducer for ztd::text UTF-16 bugs.
 *
 * Bug 1: In utf16.hpp's skip_input_error function, when handling an unpaired surrogate at the end
 * of input, the code dereferences __second_surrogate without checking if it has reached __last.
 *
 * Bug 2: In utf16.hpp's decode_one function, the bounds check for incomplete surrogate pairs
 * happens BEFORE advancing the iterator, so it doesn't catch the case where a lead surrogate
 * is the last character in the input.
 *
 * To reproduce: run this test WITHOUT the fixes applied to utf16.hpp.
 */

#include <cassert>
#include <cstddef>
#include <iterator>
#include <string>

#include <gtest/gtest.h>
#include <ztd/text.hpp>

/**
 * A string_view-like class with checked iterators that assert on past-end dereference.
 * This is used to detect when ztd::text reads past the end of input.
 */
template<class CharT>
class AssertingStringView {
 public:
    using value_type = CharT;
    using size_type = std::size_t;
    using difference_type = std::ptrdiff_t;
    using pointer = const CharT *;
    using const_pointer = const CharT *;
    using reference = const CharT &;
    using const_reference = const CharT &;

    class iterator {
     public:
        using iterator_category = std::contiguous_iterator_tag;
        using value_type = CharT;
        using difference_type = std::ptrdiff_t;
        using pointer = const CharT *;
        using reference = const CharT &;

        iterator() = default;
        iterator(const CharT *ptr, const CharT *end) : _ptr(ptr), _end(end) {}

        reference operator*() const {
            assert(_ptr < _end && "AssertingStringView: cannot dereference end iterator");
            return *_ptr;
        }

        pointer operator->() const {
            assert(_ptr < _end && "AssertingStringView: cannot dereference end iterator");
            return _ptr;
        }

        iterator &operator++() { ++_ptr; return *this; }
        iterator operator++(int) { iterator tmp = *this; ++_ptr; return tmp; }
        iterator &operator--() { --_ptr; return *this; }
        iterator operator--(int) { iterator tmp = *this; --_ptr; return tmp; }

        iterator &operator+=(difference_type n) { _ptr += n; return *this; }
        iterator &operator-=(difference_type n) { _ptr -= n; return *this; }

        iterator operator+(difference_type n) const { return iterator(_ptr + n, _end); }
        iterator operator-(difference_type n) const { return iterator(_ptr - n, _end); }
        difference_type operator-(const iterator &other) const { return _ptr - other._ptr; }

        reference operator[](difference_type n) const {
            assert(_ptr + n < _end && "AssertingStringView: index out of bounds");
            return _ptr[n];
        }

        bool operator==(const iterator &other) const { return _ptr == other._ptr; }
        bool operator!=(const iterator &other) const { return _ptr != other._ptr; }
        bool operator<(const iterator &other) const { return _ptr < other._ptr; }
        bool operator<=(const iterator &other) const { return _ptr <= other._ptr; }
        bool operator>(const iterator &other) const { return _ptr > other._ptr; }
        bool operator>=(const iterator &other) const { return _ptr >= other._ptr; }

        friend iterator operator+(difference_type n, const iterator &it) { return it + n; }

     private:
        const CharT *_ptr = nullptr;
        const CharT *_end = nullptr;
    };

    using const_iterator = iterator;

    AssertingStringView() = default;
    AssertingStringView(const CharT *data, size_type size) : _data(data), _size(size) {}

    const_iterator begin() const { return iterator(_data, _data + _size); }
    const_iterator end() const { return iterator(_data + _size, _data + _size); }
    const_iterator cbegin() const { return begin(); }
    const_iterator cend() const { return end(); }

    size_type size() const { return _size; }
    bool empty() const { return _size == 0; }
    const_pointer data() const { return _data; }

    reference operator[](size_type pos) const {
        assert(pos < _size && "AssertingStringView: index out of bounds");
        return _data[pos];
    }

 private:
    const CharT *_data = nullptr;
    size_type _size = 0;
};

/**
 * Bug reproducer: Unpaired lead surrogate at end of input causes past-end dereference.
 *
 * Input: L"\xDC00\x0028" (unpaired low surrogate followed by '(')
 * Expected: Should transcode to UTF-8 with replacement character for the unpaired surrogate.
 * Actual (before fix): Asserts "cannot dereference end iterator" because skip_input_error
 *                      tries to check if the next code unit is a trail surrogate without
 *                      first checking if we've reached the end of input.
 */
TEST(ZtdTextBug, UnpairedSurrogateAtEnd) {
    // L"\xDC00" is an unpaired low surrogate (invalid UTF-16).
    // L"\x0028" is '(' (valid).
    // When transcoding, the replacement_handler is invoked for the invalid surrogate,
    // which calls skip_input_error. The bug is in skip_input_error's surrogate pair detection.
    const wchar_t data[] = L"\xDC00\x0028";
    AssertingStringView<wchar_t> input(data, 2);

    // This should not assert - it should produce replacement character + '('
    std::string result = ztd::text::transcode(input, ztd::text::wide_utf16, ztd::text::compat_utf8,
                                              ztd::text::replacement_handler);

    // U+FFFD in UTF-8 is 0xEF 0xBF 0xBD, followed by '(' (0x28)
    EXPECT_EQ(result, "\xef\xbf\xbd(");
}

/**
 * Additional test: Lead surrogate at end of input (incomplete surrogate pair).
 */
TEST(ZtdTextBug, LeadSurrogateAtEnd) {
    // L"\xD800" is a lead surrogate without a trailing surrogate.
    const wchar_t data[] = L"\xD800";
    AssertingStringView<wchar_t> input(data, 1);

    // This should produce a replacement character, not crash.
    std::string result = ztd::text::transcode(input, ztd::text::wide_utf16, ztd::text::compat_utf8,
                                              ztd::text::replacement_handler);

    // U+FFFD in UTF-8 is 0xEF 0xBF 0xBD
    EXPECT_EQ(result, "\xef\xbf\xbd");
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions