@@ -1350,10 +1350,15 @@ int commit_tree(const struct strbuf *msg, unsigned char *tree,
1350
1350
static int find_invalid_utf8 (const char * buf , int len )
1351
1351
{
1352
1352
int offset = 0 ;
1353
+ static const unsigned int max_codepoint [] = {
1354
+ 0x7f , 0x7ff , 0xffff , 0x10ffff
1355
+ };
1353
1356
1354
1357
while (len ) {
1355
1358
unsigned char c = * buf ++ ;
1356
1359
int bytes , bad_offset ;
1360
+ unsigned int codepoint ;
1361
+ unsigned int min_val , max_val ;
1357
1362
1358
1363
len -- ;
1359
1364
offset ++ ;
@@ -1374,24 +1379,48 @@ static int find_invalid_utf8(const char *buf, int len)
1374
1379
bytes ++ ;
1375
1380
}
1376
1381
1377
- /* Must be between 1 and 5 more bytes */
1378
- if (bytes < 1 || bytes > 5 )
1382
+ /*
1383
+ * Must be between 1 and 3 more bytes. Longer sequences result in
1384
+ * codepoints beyond U+10FFFF, which are guaranteed never to exist.
1385
+ */
1386
+ if (bytes < 1 || 3 < bytes )
1379
1387
return bad_offset ;
1380
1388
1381
1389
/* Do we *have* that many bytes? */
1382
1390
if (len < bytes )
1383
1391
return bad_offset ;
1384
1392
1393
+ /*
1394
+ * Place the encoded bits at the bottom of the value and compute the
1395
+ * valid range.
1396
+ */
1397
+ codepoint = (c & 0x7f ) >> bytes ;
1398
+ min_val = max_codepoint [bytes - 1 ] + 1 ;
1399
+ max_val = max_codepoint [bytes ];
1400
+
1385
1401
offset += bytes ;
1386
1402
len -= bytes ;
1387
1403
1388
1404
/* And verify that they are good continuation bytes */
1389
1405
do {
1406
+ codepoint <<= 6 ;
1407
+ codepoint |= * buf & 0x3f ;
1390
1408
if ((* buf ++ & 0xc0 ) != 0x80 )
1391
1409
return bad_offset ;
1392
1410
} while (-- bytes );
1393
1411
1394
- /* We could/should check the value and length here too */
1412
+ /* Reject codepoints that are out of range for the sequence length. */
1413
+ if (codepoint < min_val || codepoint > max_val )
1414
+ return bad_offset ;
1415
+ /* Surrogates are only for UTF-16 and cannot be encoded in UTF-8. */
1416
+ if ((codepoint & 0x1ff800 ) == 0xd800 )
1417
+ return bad_offset ;
1418
+ /* U+xxFFFE and U+xxFFFF are guaranteed non-characters. */
1419
+ if ((codepoint & 0xffffe ) == 0xfffe )
1420
+ return bad_offset ;
1421
+ /* So are anything in the range U+FDD0..U+FDEF. */
1422
+ if (codepoint >= 0xfdd0 && codepoint <= 0xfdef )
1423
+ return bad_offset ;
1395
1424
}
1396
1425
return -1 ;
1397
1426
}
@@ -1401,9 +1430,6 @@ static int find_invalid_utf8(const char *buf, int len)
1401
1430
*
1402
1431
* If it isn't, it assumes any non-utf8 characters are Latin1,
1403
1432
* and does the conversion.
1404
- *
1405
- * Fixme: we should probably also disallow overlong forms and
1406
- * invalid characters. But we don't do that currently.
1407
1433
*/
1408
1434
static int verify_utf8 (struct strbuf * buf )
1409
1435
{
0 commit comments