@@ -755,100 +755,117 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
755755
756756PyObject  * PyCodec_XMLCharRefReplaceErrors (PyObject  * exc )
757757{
758-     if  (PyObject_TypeCheck (exc , (PyTypeObject  * )PyExc_UnicodeEncodeError )) {
759-         PyObject  * restuple ;
760-         PyObject  * object ;
761-         Py_ssize_t  i ;
762-         Py_ssize_t  start ;
763-         Py_ssize_t  end ;
764-         PyObject  * res ;
765-         Py_UCS1  * outp ;
766-         Py_ssize_t  ressize ;
767-         Py_UCS4  ch ;
768-         if  (PyUnicodeEncodeError_GetStart (exc , & start ))
769-             return  NULL ;
770-         if  (PyUnicodeEncodeError_GetEnd (exc , & end ))
771-             return  NULL ;
772-         if  (!(object  =  PyUnicodeEncodeError_GetObject (exc )))
773-             return  NULL ;
774-         if  (end  -  start  >  PY_SSIZE_T_MAX  / (2 + 7 + 1 ))
775-             end  =  start  +  PY_SSIZE_T_MAX  / (2 + 7 + 1 );
776-         for  (i  =  start , ressize  =  0 ; i  <  end ; ++ i ) {
777-             /* object is guaranteed to be "ready" */ 
778-             ch  =  PyUnicode_READ_CHAR (object , i );
779-             if  (ch < 10 )
780-                 ressize  +=  2 + 1 + 1 ;
781-             else  if  (ch < 100 )
782-                 ressize  +=  2 + 2 + 1 ;
783-             else  if  (ch < 1000 )
784-                 ressize  +=  2 + 3 + 1 ;
785-             else  if  (ch < 10000 )
786-                 ressize  +=  2 + 4 + 1 ;
787-             else  if  (ch < 100000 )
788-                 ressize  +=  2 + 5 + 1 ;
789-             else  if  (ch < 1000000 )
790-                 ressize  +=  2 + 6 + 1 ;
791-             else 
792-                 ressize  +=  2 + 7 + 1 ;
758+     if  (!PyObject_TypeCheck (exc , (PyTypeObject  * )PyExc_UnicodeEncodeError )) {
759+         wrong_exception_type (exc );
760+         return  NULL ;
761+     }
762+ 
763+     Py_ssize_t  start , end ;
764+     if  (PyUnicodeEncodeError_GetStart (exc , & start )) {
765+         return  NULL ;
766+     }
767+     if  (PyUnicodeEncodeError_GetEnd (exc , & end )) {
768+         return  NULL ;
769+     }
770+     if  (end  <= start ) {
771+         // gh-12337 will handle negative end or start (for now we crash) 
772+         return  Py_BuildValue ("(Nn)" , Py_GetConstant (Py_CONSTANT_EMPTY_STR ), end );
773+     }
774+ 
775+     PyObject  * obj  =  PyUnicodeEncodeError_GetObject (exc );
776+     if  (obj  ==  NULL ) {
777+         return  NULL ;
778+     }
779+ 
780+     if  (end  -  start  >  PY_SSIZE_T_MAX  / 10 ) {
781+         end  =  start  +  PY_SSIZE_T_MAX  / 10 ;
782+     }
783+ 
784+     end  =  Py_MIN (end , PyUnicode_GET_LENGTH (obj ));
785+ 
786+     Py_ssize_t  ressize  =  0 ;
787+     for  (Py_ssize_t  i  =  start ; i  <  end ; ++ i ) {
788+         /* object is guaranteed to be "ready" */ 
789+         Py_UCS4  ch  =  PyUnicode_READ_CHAR (obj , i );
790+         // The number of characters that each character 'ch' contributes 
791+         // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}. 
792+         if  (ch  <  10 ) {
793+             ressize  +=  4 ;
793794        }
794-         /* allocate replacement */ 
795-         res  =  PyUnicode_New (ressize , 127 );
796-         if  (res  ==  NULL ) {
797-             Py_DECREF (object );
798-             return  NULL ;
795+         else  if  (ch  <  100 ) {
796+             ressize  +=  5 ;
799797        }
800-         outp  =  PyUnicode_1BYTE_DATA (res );
801-         /* generate replacement */ 
802-         for  (i  =  start ; i  <  end ; ++ i ) {
803-             int  digits ;
804-             int  base ;
805-             ch  =  PyUnicode_READ_CHAR (object , i );
806-             * outp ++  =  '&' ;
807-             * outp ++  =  '#' ;
808-             if  (ch < 10 ) {
809-                 digits  =  1 ;
810-                 base  =  1 ;
811-             }
812-             else  if  (ch < 100 ) {
813-                 digits  =  2 ;
814-                 base  =  10 ;
815-             }
816-             else  if  (ch < 1000 ) {
817-                 digits  =  3 ;
818-                 base  =  100 ;
819-             }
820-             else  if  (ch < 10000 ) {
821-                 digits  =  4 ;
822-                 base  =  1000 ;
823-             }
824-             else  if  (ch < 100000 ) {
825-                 digits  =  5 ;
826-                 base  =  10000 ;
827-             }
828-             else  if  (ch < 1000000 ) {
829-                 digits  =  6 ;
830-                 base  =  100000 ;
831-             }
832-             else  {
833-                 digits  =  7 ;
834-                 base  =  1000000 ;
835-             }
836-             while  (digits -- > 0 ) {
837-                 * outp ++  =  '0'  +  ch /base ;
838-                 ch  %= base ;
839-                 base  /= 10 ;
840-             }
841-             * outp ++  =  ';' ;
798+         else  if  (ch  <  1000 ) {
799+             ressize  +=  6 ;
800+         }
801+         else  if  (ch  <  10000 ) {
802+             ressize  +=  7 ;
803+         }
804+         else  if  (ch  <  100000 ) {
805+             ressize  +=  8 ;
806+         }
807+         else  if  (ch  <  1000000 ) {
808+             ressize  +=  9 ;
809+         }
810+         else  {
811+             assert (ch  <  10000000 );
812+             ressize  +=  10 ;
842813        }
843-         assert (_PyUnicode_CheckConsistency (res , 1 ));
844-         restuple  =  Py_BuildValue ("(Nn)" , res , end );
845-         Py_DECREF (object );
846-         return  restuple ;
847814    }
848-     else  {
849-         wrong_exception_type (exc );
815+ 
816+     /* allocate replacement */ 
817+     PyObject  * res  =  PyUnicode_New (ressize , 127 );
818+     if  (res  ==  NULL ) {
819+         Py_DECREF (obj );
850820        return  NULL ;
851821    }
822+     Py_UCS1  * outp  =  PyUnicode_1BYTE_DATA (res );
823+     /* generate replacement */ 
824+     for  (Py_ssize_t  i  =  start ; i  <  end ; ++ i ) {
825+         int  digits , base ;
826+         Py_UCS4  ch  =  PyUnicode_READ_CHAR (obj , i );
827+         if  (ch  <  10 ) {
828+             digits  =  1 ;
829+             base  =  1 ;
830+         }
831+         else  if  (ch  <  100 ) {
832+             digits  =  2 ;
833+             base  =  10 ;
834+         }
835+         else  if  (ch  <  1000 ) {
836+             digits  =  3 ;
837+             base  =  100 ;
838+         }
839+         else  if  (ch  <  10000 ) {
840+             digits  =  4 ;
841+             base  =  1000 ;
842+         }
843+         else  if  (ch  <  100000 ) {
844+             digits  =  5 ;
845+             base  =  10000 ;
846+         }
847+         else  if  (ch  <  1000000 ) {
848+             digits  =  6 ;
849+             base  =  100000 ;
850+         }
851+         else  {
852+             assert (ch  <  10000000 );
853+             digits  =  7 ;
854+             base  =  1000000 ;
855+         }
856+         * outp ++  =  '&' ;
857+         * outp ++  =  '#' ;
858+         while  (digits --  >  0 ) {
859+             * outp ++  =  '0'  +  ch  / base ;
860+             ch  %= base ;
861+             base  /= 10 ;
862+         }
863+         * outp ++  =  ';' ;
864+     }
865+     assert (_PyUnicode_CheckConsistency (res , 1 ));
866+     PyObject  * restuple  =  Py_BuildValue ("(Nn)" , res , end );
867+     Py_DECREF (obj );
868+     return  restuple ;
852869}
853870
854871PyObject  * PyCodec_BackslashReplaceErrors (PyObject  * exc )
0 commit comments