Skip to content
This repository was archived by the owner on Aug 29, 2025. It is now read-only.

Commit 9274ad8

Browse files
DuffsDeviceJakob Riedle
andauthored
Added basic_utf8_string::starts_with/ends_with (#54)
* Added basic_utf8_string::starts_with/ends_with * Added test cases for starts_with and ends_with Co-authored-by: Jakob Riedle <[email protected]>
1 parent 276f8c6 commit 9274ad8

File tree

2 files changed

+206
-11
lines changed

2 files changed

+206
-11
lines changed

include/tinyutf8/tinyutf8.h

Lines changed: 187 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,11 @@ namespace tiny_utf8
190190
char last;
191191
} bytes;
192192
};
193+
194+
//! strlen for different character types: counts the elements that precede the first zero element.
//! 'str' is expected to point to a valid, zero-terminated sequence.
template<typename T>
inline std::size_t strlen( const T* str ) noexcept {
    const T* end = str;
    while( *end )
        ++end;
    return static_cast<std::size_t>( end - str );
}
//! Specialization for plain 'char' that forwards to the C library implementation
template<> inline std::size_t strlen<char>( const char* str ) noexcept { return std::strlen( str ); }
193198
}
194199

195200

@@ -301,7 +306,7 @@ namespace tiny_utf8
301306
protected:
302307

303308
difference_type t_raw_index;
304-
Container* t_instance;
309+
Container* t_instance = nullptr;
305310

306311
protected:
307312

@@ -1590,10 +1595,10 @@ namespace tiny_utf8
15901595
*/
15911596
inline raw_reference back() noexcept { return { back_index() , this }; }
15921597
inline value_type back() const noexcept {
1593-
size_type sz = size();
1598+
size_type my_size = size();
15941599
const data_type* buffer = get_buffer();
1595-
width_type bytes = get_num_bytes_of_utf8_char_before( buffer , sz );
1596-
return decode_utf8( buffer + sz - bytes , bytes );
1600+
width_type bytes = get_num_bytes_of_utf8_char_before( buffer , my_size );
1601+
return decode_utf8( buffer + my_size - bytes , bytes );
15971602
}
15981603

15991604

@@ -2130,6 +2135,177 @@ namespace tiny_utf8
21302135
size_type raw_find_last_not_of( const value_type* str , size_type start_byte = basic_utf8_string::npos ) const noexcept ;
21312136

21322137

2138+
/**
2139+
* Check, whether this string ends with the supplied character sequence
2140+
*
2141+
* @param str The string to compare the end of this string with
2142+
* @return true, if this string ends with the sequence 'str', false otherwise.
2143+
*/
2144+
inline bool starts_with( const basic_utf8_string& str ) const noexcept {
2145+
size_type my_size = size(), str_size = str.size();
2146+
return my_size >= str_size && std::memcmp( data() , str.data() , str_size ) == 0;
2147+
}
2148+
/**
2149+
* Check, whether this string ends with the supplied character sequence
2150+
*
2151+
* @param str The string to compare the end of this string with
2152+
* @return true, if this string ends with the sequence 'str', false otherwise.
2153+
*/
2154+
inline bool starts_with( const std::string& str ) const noexcept {
2155+
size_type my_size = size(), str_size = str.size();
2156+
return my_size >= str_size && std::memcmp( data() , str.data() , str_size ) == 0;
2157+
}
2158+
/**
2159+
* Check, whether this string ends with the supplied codepoint
2160+
*
2161+
* @param str The codepoint to compare the end of this string with
2162+
* @return true, if this string ends with the codepoint 'cp', false otherwise.
2163+
*/
2164+
inline bool starts_with( value_type cp ) const noexcept {
2165+
return !empty() && front() == cp;
2166+
}
2167+
/**
2168+
* Check, whether this string ends with the supplied UTF-8 sequence.
2169+
*
2170+
* @param str Null-terminated string literal, interpreted as UTF-8. The pointer is expected to be valid
2171+
* @return true, if this string ends with the sequence 'str', false otherwise.
2172+
*/
2173+
template<typename T>
2174+
bool starts_with( T str , enable_if_ptr<T, data_type>* = {} ) const noexcept {
2175+
size_type my_size = size(), str_size = std::strlen( str );
2176+
if( my_size < str_size )
2177+
return false;
2178+
for( const data_type* my_data = data() ; *str && *str == *my_data ; ++str, ++my_data );
2179+
return !*str;
2180+
}
2181+
/**
2182+
* Check, whether this string ends with the supplied UTF-8 sequence.
2183+
*
2184+
* @param str Pointer to a string literal with possibly embedded zeros, interpreted as UTF-8. The pointer is expected to be valid
2185+
* @return true, if this string ends with the sequence 'str', false otherwise.
2186+
*/
2187+
template<size_type LITLEN>
2188+
bool starts_with( const data_type (&str)[LITLEN] ) const noexcept {
2189+
size_type my_size = size(), str_size = str[LITLEN-1] ? LITLEN : LITLEN-1;
2190+
return my_size >= str_size && std::memcmp( data() , str , str_size ) == 0;
2191+
}
2192+
/**
2193+
* Check, whether this string ends with the supplied codepoint sequence.
2194+
*
2195+
* @param str Pointer to a null-terminated string literal, interpreted as UTF-32. The pointer is expected to be valid
2196+
* @return true, if this string ends with the sequence 'str', false otherwise.
2197+
*/
2198+
template<typename T>
2199+
bool starts_with( T str , enable_if_ptr<T, value_type>* = {} ) const noexcept {
2200+
for( const_iterator it = cbegin(), end = cend() ; *str && it != end && *str == *it ; ++str, ++it );
2201+
return !*str;
2202+
}
2203+
/**
2204+
* Check, whether this string ends with the supplied codepoint sequence.
2205+
*
2206+
* @param str Pointer to a string literal with possibly embedded zeros, interpreted as UTF-32. The pointer is expected to be valid
2207+
* @return true, if this string ends with the sequence 'str', false otherwise.
2208+
*/
2209+
template<size_type LITLEN>
2210+
bool starts_with( const value_type (&str)[LITLEN] ) const noexcept {
2211+
size_type str_len = str[LITLEN-1] ? LITLEN : LITLEN-1;
2212+
const_iterator it = cbegin(), end = cend();
2213+
while( it != end && str_len ){
2214+
if( *it != *str )
2215+
return false;
2216+
++it, ++str, --str_len;
2217+
}
2218+
return !str_len;
2219+
}
2220+
2221+
2222+
/**
2223+
* Check, whether this string ends with the supplied character sequence
2224+
*
2225+
* @param str The string to compare the end of this string with
2226+
* @return true, if this string ends with the sequence 'str', false otherwise.
2227+
*/
2228+
inline bool ends_with( const basic_utf8_string& str ) const noexcept {
2229+
size_type my_size = size(), str_size = str.size();
2230+
return my_size >= str_size && std::memcmp( data() + my_size - str_size , str.data() , str_size ) == 0;
2231+
}
2232+
/**
2233+
* Check, whether this string ends with the supplied character sequence
2234+
*
2235+
* @param str The string to compare the end of this string with
2236+
* @return true, if this string ends with the sequence 'str', false otherwise.
2237+
*/
2238+
inline bool ends_with( const std::string& str ) const noexcept {
2239+
size_type my_size = size(), str_size = str.size();
2240+
return my_size >= str_size && std::memcmp( data() + my_size - str_size , str.data() , str_size ) == 0;
2241+
}
2242+
/**
2243+
* Check, whether this string ends with the supplied codepoint
2244+
*
2245+
* @param str The codepoint to compare the end of this string with
2246+
* @return true, if this string ends with the codepoint 'cp', false otherwise.
2247+
*/
2248+
inline bool ends_with( value_type cp ) const noexcept {
2249+
return !empty() && back() == cp;
2250+
}
2251+
/**
2252+
* Check, whether this string ends with the supplied UTF-8 sequence.
2253+
*
2254+
* @param str Null-terminated string literal, interpreted as UTF-8. The pointer is expected to be valid
2255+
* @return true, if this string ends with the sequence 'str', false otherwise.
2256+
*/
2257+
template<typename T>
2258+
bool ends_with( T str , enable_if_ptr<T, data_type>* = {} ) const noexcept {
2259+
size_type my_size = size(), str_size = std::strlen(str);
2260+
return my_size >= str_size && std::memcmp( data() + my_size - str_size , str , str_size ) == 0;
2261+
}
2262+
/**
2263+
* Check, whether this string ends with the supplied UTF-8 sequence.
2264+
*
2265+
* @param str Pointer to a string literal with possibly embedded zeros, interpreted as UTF-8. The pointer is expected to be valid
2266+
* @return true, if this string ends with the sequence 'str', false otherwise.
2267+
*/
2268+
template<size_type LITLEN>
2269+
bool ends_with( const data_type (&str)[LITLEN] ) const noexcept {
2270+
size_type my_size = size(), str_size = str[LITLEN-1] ? LITLEN : LITLEN-1;
2271+
return my_size >= str_size && std::memcmp( data() + my_size - str_size , str , str_size ) == 0;
2272+
}
2273+
/**
2274+
* Check, whether this string ends with the supplied codepoint sequence.
2275+
*
2276+
* @param str Pointer to a null-terminated string literal, interpreted as UTF-32. The pointer is expected to be valid
2277+
* @return true, if this string ends with the sequence 'str', false otherwise.
2278+
*/
2279+
template<typename T>
2280+
bool ends_with( T str , enable_if_ptr<T, value_type>* = {} ) const noexcept {
2281+
size_type str_len = tiny_utf8_detail::strlen( str );
2282+
const_reverse_iterator it = crbegin(), end = crend();
2283+
while( it != end && str_len ){
2284+
if( *it != str[--str_len] )
2285+
return false;
2286+
++it;
2287+
}
2288+
return !str_len;
2289+
}
2290+
/**
2291+
* Check, whether this string ends with the supplied codepoint sequence.
2292+
*
2293+
* @param str Pointer to a string literal with possibly embedded zeros, interpreted as UTF-32. The pointer is expected to be valid
2294+
* @return true, if this string ends with the sequence 'str', false otherwise.
2295+
*/
2296+
template<size_type LITLEN>
2297+
bool ends_with( const value_type (&str)[LITLEN] ) const noexcept {
2298+
size_type str_len = str[LITLEN-1] ? LITLEN : LITLEN-1;
2299+
const_reverse_iterator it = crbegin(), end = crend();
2300+
while( it != end && str_len ){
2301+
if( *it != str[--str_len] )
2302+
return false;
2303+
++it;
2304+
}
2305+
return !str_len;
2306+
}
2307+
2308+
21332309
/**
21342310
* Compare this string with the supplied one.
21352311
*
@@ -2141,8 +2317,8 @@ namespace tiny_utf8
21412317
* the compared string, or all compared characters match but the compared string is longer.
21422318
*/
21432319
inline int compare( const basic_utf8_string& str ) const noexcept {
2144-
size_type my_size = size(), str_size = str.size();
2145-
int result = std::memcmp( data() , str.data() , my_size < str_size ? my_size : str_size );
2320+
size_type my_size = size(), str_size = str.size();
2321+
int result = std::memcmp( data() , str.data() , my_size < str_size ? my_size : str_size );
21462322
if( !result && my_size != str_size )
21472323
result = my_size < str_size ? -1 : 1;
21482324
return result;
@@ -2158,15 +2334,15 @@ namespace tiny_utf8
21582334
* the compared string, or all compared characters match but the compared string is longer.
21592335
*/
21602336
inline int compare( const std::string& str ) const noexcept {
2161-
size_type my_size = size(), str_size = str.size();
2162-
int result = std::memcmp( data() , str.data() , my_size < str_size ? my_size : str_size );
2337+
size_type my_size = size(), str_size = str.size();
2338+
int result = std::memcmp( data() , str.data() , my_size < str_size ? my_size : str_size );
21632339
if( !result && my_size != str_size )
21642340
result = my_size < str_size ? -1 : 1;
21652341
return result;
21662342
}
21672343
/**
21682344
* Compares this string with the supplied one.
2169-
* Thes supplied string literal is considered to end with the trailling '\0'.
2345+
* The supplied string literal is assumed to end at the (possibly trailing) '\0'.
21702346
* This is especially important, if this utf8 string contains embedded zeros.
21712347
*
21722348
* @param str Null-terminated string literal, interpreted as UTF-8. The pointer is expected to be valid
@@ -2209,7 +2385,7 @@ namespace tiny_utf8
22092385
}
22102386
/**
22112387
* Compares this string with the supplied one.
2212-
* Thes supplied string literal is considered to end with the trailling '\0'.
2388+
* The supplied string literal is assumed to end at the (possibly trailing) '\0'.
22132389
* This is especially important, if this utf8 string contains embedded zeros.
22142390
*
22152391
* @param str Pointer to a null-terminated string literal, interpreted as UTF-32. The pointer is expected to be valid
@@ -2242,7 +2418,7 @@ namespace tiny_utf8
22422418
template<size_type LITLEN>
22432419
int compare( const value_type (&str)[LITLEN] ) const noexcept {
22442420
const_iterator it = cbegin(), end = cend();
2245-
size_type index = 0, length = str[LITLEN-1] ? LITLEN : LITLEN-1;
2421+
size_type index = 0, length = str[LITLEN-1] ? LITLEN : LITLEN-1;
22462422
while( it != end && index < length ){
22472423
if( *it != str[index] )
22482424
return *it < str[index] ? -1 : 1;

test/src/test_search.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,22 @@ TEST(TinyUTF8, FindSubstr)
1616
EXPECT_EQ(str.find(find), 2);
1717
EXPECT_EQ(str.rfind(rfind), 9);
1818
}
19+
20+
// Exercises starts_with/ends_with with UTF-32 pointers and with utf8_string arguments
TEST(TinyUTF8, StartsEndsWith)
{
    tiny_utf8::utf8_string str = U"Hello World ツ♫";

    // Suffix candidates
    const char32_t* matching_suffix = U"ツ♫";
    const char32_t* foreign_suffix = U"e";
    // Prefix candidates
    const char32_t* matching_prefix = U"Hello ";
    const char32_t* foreign_prefix = U"Hell ";

    EXPECT_TRUE(str.ends_with(matching_suffix));
    EXPECT_FALSE(str.ends_with(foreign_suffix));
    EXPECT_TRUE(str.ends_with(tiny_utf8::utf8_string(matching_suffix)));
    EXPECT_FALSE(str.ends_with(tiny_utf8::utf8_string(foreign_suffix)));

    EXPECT_TRUE(str.starts_with(matching_prefix));
    EXPECT_FALSE(str.starts_with(foreign_prefix));
    EXPECT_TRUE(str.starts_with(tiny_utf8::utf8_string(matching_prefix)));
    EXPECT_FALSE(str.starts_with(tiny_utf8::utf8_string(foreign_prefix)));
}

0 commit comments

Comments
 (0)