@@ -475,18 +475,22 @@ PERL_STATIC_INLINE int
475475S_is_utf8_overlong (const U8 * const s , const STRLEN len )
476476{
477477 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
478- * 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if
479- * it isn't, and -1 if there isn't enough information to tell. This last
480- * return value can happen if the sequence is incomplete, missing some
481- * trailing bytes that would form a complete character. If there are
482- * enough bytes to make a definitive decision, this function does so.
483- * Usually 2 bytes are sufficient .
478+ * 's' + 'len' - 1 is an overlong. It returns a positive number if it is
479+ * an overlong; 0 if it isn't, and -1 if there isn't enough information to
480+ * tell. This last return value can happen if the sequence is incomplete,
481+ * missing some trailing bytes that would form a complete character. If
482+ * there are enough bytes to make a definitive decision, this function does
483+ * so.
484484 *
485- * Overlongs can occur whenever the number of continuation bytes changes.
486- * That means whenever the number of leading 1 bits in a start byte
487- * increases from the next lower start byte. That happens for start bytes
488- * C0, E0, F0, F8, FC, FE, and FF.
489- */
485+ * The positive number returned when it is overlong is the number of bytes
486+ * that needed to be examined to make that determination. Usually 1 or 2
487+ * bytes are sufficient.
488+ *
489+ * Overlongs can occur for a few of the smallest start bytes or whenever
490+ * the number of continuation bytes changes. The latter means whenever the
491+ * number of leading 1 bits in a start byte increases from the next lower
492+ * start byte. That happens for start bytes C0, E0, F0, F8, FC, FE, and
493+ * FF. */
490494
491495 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG ;
492496
@@ -512,7 +516,7 @@ S_is_utf8_overlong(const U8 * const s, const STRLEN len)
512516 return 1 ;
513517#else
514518 case 0xE0 :
515- return (len < 2 ) ? -1 : s [1 ] < 0xA0 ;
519+ return (len < 2 ) ? -1 : ( s [1 ] < 0xA0 ) ? 2 : 0 ;
516520#endif
517521
518522 case 0xF0 :
@@ -522,8 +526,10 @@ S_is_utf8_overlong(const U8 * const s, const STRLEN len)
522526 return (len < 2 )
523527 ? -1 /* This pattern encapsulates
524528 * F0 => 0x10; F8 => 0x08; FC => 0x04; FE => 0x02 */
525- : NATIVE_UTF8_TO_I8 (s [1 ]) < UTF_MIN_CONTINUATION_BYTE
526- + 0x100 - NATIVE_UTF8_TO_I8 (s [0 ]);
529+ : (NATIVE_UTF8_TO_I8 (s [1 ]) < UTF_MIN_CONTINUATION_BYTE
530+ + 0x100 - NATIVE_UTF8_TO_I8 (s [0 ]))
531+ ? 2
532+ : 0 ;
527533 case 0xFF :
528534 return isFF_overlong (s , len );
529535 }
@@ -533,11 +539,15 @@ PERL_STATIC_INLINE int
533539S_isFF_overlong (const U8 * const s , const STRLEN len )
534540{
535541 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
536- * 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if
537- * it isn't, and -1 if there isn't enough information to tell. This last
538- * return value can happen if the sequence is incomplete, missing some
539- * trailing bytes that would form a complete character. If there are
540- * enough bytes to make a definitive decision, this function does so. */
542+ * 's' + 'len' - 1 is an overlong beginning with \xFF. It returns a positive
543+ * number if it is; 0 if it isn't, and -1 if there isn't enough
544+ * information to tell. This last return value can happen if the sequence
545+ * is incomplete, missing some trailing bytes that would form a complete
546+ * character. If there are enough bytes to make a definitive decision,
547+ * this function does so.
548+ *
549+ * A positive return gives the number of bytes needed to be examined to
550+ * make the determination. */
541551
542552 PERL_ARGS_ASSERT_ISFF_OVERLONG ;
543553
@@ -560,7 +570,7 @@ S_isFF_overlong(const U8 * const s, const STRLEN len)
560570 * be there; what comes after them doesn't matter. See tables in utf8.h,
561571 * utfebcdic.h. */
562572 if (len >= STRLENs (FF_OVERLONG_PREFIX )) {
563- return 1 ;
573+ return STRLENs ( FF_OVERLONG_PREFIX ) ;
564574 }
565575
566576 /* The missing bytes could cause the result to go one way or the other, so
0 commit comments