@@ -132,6 +132,7 @@ static int utf_ptr2cells_len __ARGS((char_u *p, int size));
132132static int dbcs_char2cells __ARGS ((int c ));
133133static int dbcs_ptr2cells_len __ARGS ((char_u * p , int size ));
134134static int dbcs_ptr2char __ARGS ((char_u * p ));
135+ static int utf_safe_read_char_adv __ARGS ((char_u * * s , size_t * n ));
135136
136137/*
137138 * Lookup table to quickly get the length in bytes of a UTF-8 character from
@@ -1700,6 +1701,66 @@ utf_ptr2char(p)
17001701 return p [0 ];
17011702}
17021703
1704+ /*
1705+ * Convert a UTF-8 byte sequence to a wide character.
1706+ * String is assumed to be terminated by NUL or after "n" bytes, whichever
1707+ * comes first.
1708+ * The function is safe in the sense that it never accesses memory beyond the
1709+ * first "n" bytes of "s".
1710+ *
1711+ * On success, returns decoded codepoint, advances "s" to the beginning of
1712+ * next character and decreases "n" accordingly.
1713+ *
1714+ * If end of string was reached, returns 0 and, if "n" > 0, advances "s" past
1715+ * NUL byte.
1716+ *
1717+ * If byte sequence is illegal or incomplete, returns -1 and does not advance
1718+ * "s".
1719+ */
1720+ static int
1721+ utf_safe_read_char_adv (s , n )
1722+ char_u * * s ;
1723+ size_t * n ;
1724+ {
1725+ int c , k ;
1726+
1727+ if (* n == 0 ) /* end of buffer */
1728+ return 0 ;
1729+
1730+ k = utf8len_tab_zero [* * s ];
1731+
1732+ if (k == 1 )
1733+ {
1734+ /* ASCII character or NUL */
1735+ (* n )-- ;
1736+ return * (* s )++ ;
1737+ }
1738+
1739+ if ((size_t )k <= * n )
1740+ {
1741+ /* We have a multibyte sequence and it isn't truncated by buffer
1742+ * limits so utf_ptr2char() is safe to use. Or the first byte is
1743+ * illegal (k=0), and it's also safe to use utf_ptr2char(). */
1744+ c = utf_ptr2char (* s );
1745+
1746+ /* On failure, utf_ptr2char() returns the first byte, so here we
1747+ * check equality with the first byte. The only non-ASCII character
1748+ * which equals the first byte of its own UTF-8 representation is
1749+ * U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too.
1750+ * It's safe even if n=1, else we would have k=2 > n. */
1751+ if (c != (int )(* * s ) || (c == 0xC3 && (* s )[1 ] == 0x83 ))
1752+ {
1753+ /* byte sequence was successfully decoded */
1754+ * s += k ;
1755+ * n -= k ;
1756+ return c ;
1757+ }
1758+ }
1759+
1760+ /* byte sequence is incomplete or illegal */
1761+ return -1 ;
1762+ }
1763+
17031764/*
17041765 * Get character at **pp and advance *pp to the next character.
17051766 * Note: composing characters are skipped!
@@ -2667,7 +2728,8 @@ static convertStruct foldCase[] =
26672728 {0x10400 ,0x10427 ,1 ,40 }
26682729};
26692730
2670- static int utf_convert (int a , convertStruct table [], int tableSize );
2731+ static int utf_convert __ARGS ((int a , convertStruct table [], int tableSize ));
2732+ static int utf_strnicmp __ARGS ((char_u * s1 , char_u * s2 , size_t n1 , size_t n2 ));
26712733
26722734/*
26732735 * Generic conversion function for case operations.
@@ -3079,6 +3141,80 @@ utf_isupper(a)
30793141 return (utf_tolower (a ) != a );
30803142}
30813143
3144+ static int
3145+ utf_strnicmp (s1 , s2 , n1 , n2 )
3146+ char_u * s1 , * s2 ;
3147+ size_t n1 , n2 ;
3148+ {
3149+ int c1 , c2 , cdiff ;
3150+ char_u buffer [6 ];
3151+
3152+ for (;;)
3153+ {
3154+ c1 = utf_safe_read_char_adv (& s1 , & n1 );
3155+ c2 = utf_safe_read_char_adv (& s2 , & n2 );
3156+
3157+ if (c1 <= 0 || c2 <= 0 )
3158+ break ;
3159+
3160+ if (c1 == c2 )
3161+ continue ;
3162+
3163+ cdiff = utf_fold (c1 ) - utf_fold (c2 );
3164+ if (cdiff != 0 )
3165+ return cdiff ;
3166+ }
3167+
3168+ /* some string ended or has an incomplete/illegal character sequence */
3169+
3170+ if (c1 == 0 || c2 == 0 )
3171+ {
3172+ /* some string ended. shorter string is smaller */
3173+ if (c1 == 0 && c2 == 0 )
3174+ return 0 ;
3175+ return c1 == 0 ? -1 : 1 ;
3176+ }
3177+
3178+ /* Continue with bytewise comparison to produce some result that
3179+ * would make comparison operations involving this function transitive.
3180+ *
3181+ * If only one string had an error, comparison should be made with
3182+ * folded version of the other string. In this case it is enough
3183+ * to fold just one character to determine the result of comparison. */
3184+
3185+ if (c1 != -1 && c2 == -1 )
3186+ {
3187+ n1 = utf_char2bytes (utf_fold (c1 ), buffer );
3188+ s1 = buffer ;
3189+ }
3190+ else if (c2 != -1 && c1 == -1 )
3191+ {
3192+ n2 = utf_char2bytes (utf_fold (c2 ), buffer );
3193+ s2 = buffer ;
3194+ }
3195+
3196+ while (n1 > 0 && n2 > 0 && * s1 != NUL && * s2 != NUL )
3197+ {
3198+ cdiff = (int )(* s1 ) - (int )(* s2 );
3199+ if (cdiff != 0 )
3200+ return cdiff ;
3201+
3202+ s1 ++ ;
3203+ s2 ++ ;
3204+ n1 -- ;
3205+ n2 -- ;
3206+ }
3207+
3208+ if (n1 > 0 && * s1 == NUL )
3209+ n1 = 0 ;
3210+ if (n2 > 0 && * s2 == NUL )
3211+ n2 = 0 ;
3212+
3213+ if (n1 == 0 && n2 == 0 )
3214+ return 0 ;
3215+ return n1 == 0 ? -1 : 1 ;
3216+ }
3217+
30823218/*
30833219 * Version of strnicmp() that handles multi-byte characters.
30843220 * Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can
@@ -3092,49 +3228,21 @@ mb_strnicmp(s1, s2, nn)
30923228 char_u * s1 , * s2 ;
30933229 size_t nn ;
30943230{
3095- int i , j , l ;
3231+ int i , l ;
30963232 int cdiff ;
3097- int incomplete = FALSE;
30983233 int n = (int )nn ;
30993234
3100- for ( i = 0 ; i < n ; i += l )
3235+ if ( enc_utf8 )
31013236 {
3102- if (s1 [i ] == NUL && s2 [i ] == NUL ) /* both strings end */
3103- return 0 ;
3104- if (enc_utf8 )
3105- {
3106- l = utf_byte2len (s1 [i ]);
3107- if (l > n - i )
3108- {
3109- l = n - i ; /* incomplete character */
3110- incomplete = TRUE;
3111- }
3112- /* Check directly first, it's faster. */
3113- for (j = 0 ; j < l ; ++ j )
3114- {
3115- if (s1 [i + j ] != s2 [i + j ])
3116- break ;
3117- if (s1 [i + j ] == 0 )
3118- /* Both stings have the same bytes but are incomplete or
3119- * have illegal bytes, accept them as equal. */
3120- l = j ;
3121- }
3122- if (j < l )
3123- {
3124- /* If one of the two characters is incomplete return -1. */
3125- if (incomplete || i + utf_byte2len (s2 [i ]) > n )
3126- return -1 ;
3127- /* Don't case-fold illegal bytes or truncated characters. */
3128- if (utf_ptr2len (s1 + i ) < l || utf_ptr2len (s2 + i ) < l )
3129- return -1 ;
3130- cdiff = utf_fold (utf_ptr2char (s1 + i ))
3131- - utf_fold (utf_ptr2char (s2 + i ));
3132- if (cdiff != 0 )
3133- return cdiff ;
3134- }
3135- }
3136- else
3237+ return utf_strnicmp (s1 , s2 , nn , nn );
3238+ }
3239+ else
3240+ {
3241+ for (i = 0 ; i < n ; i += l )
31373242 {
3243+ if (s1 [i ] == NUL && s2 [i ] == NUL ) /* both strings end */
3244+ return 0 ;
3245+
31383246 l = (* mb_ptr2len )(s1 + i );
31393247 if (l <= 1 )
31403248 {
0 commit comments