@@ -23,6 +23,8 @@ class Html2Text
2323{
2424 const ENCODING = 'UTF-8 ' ;
2525
26+ protected $ htmlFuncFlags ;
27+
2628 /**
2729 * Contains the HTML content to convert.
2830 *
@@ -236,6 +238,9 @@ public function __construct($html = '', $options = array())
236238
237239 $ this ->html = $ html ;
238240 $ this ->options = array_merge ($ this ->options , $ options );
241+ $ this ->htmlFuncFlags = (PHP_VERSION_ID < 50400 )
242+ ? ENT_COMPAT
243+ : ENT_COMPAT | ENT_HTML5 ;
239244 }
240245
241246 /**
@@ -318,6 +323,16 @@ public function set_base_url($baseurl)
318323 }
319324
/**
 * Runs the conversion with the mbstring internal encoding temporarily
 * switched to self::ENCODING.
 *
 * The mb_* calls made during conversion rely on the internal encoding,
 * which is process-wide state. The caller's encoding is therefore saved
 * up front and put back afterwards, including when doConvert() throws,
 * so a failed conversion cannot leave the global encoding overridden.
 *
 * @throws \Exception Re-throws whatever doConvert() raised, after the
 *                    original encoding has been restored.
 */
protected function convert()
{
    $origEncoding = mb_internal_encoding();
    mb_internal_encoding(self::ENCODING);

    // "finally" only exists from PHP 5.5 on, while this class still
    // branches on PHP_VERSION_ID < 50400, so restore-and-rethrow is
    // spelled out by hand.
    try {
        $this->doConvert();
    } catch (\Exception $e) {
        mb_internal_encoding($origEncoding);
        throw $e;
    } catch (\Throwable $e) {
        // PHP 7+ engine errors (\Error). On PHP 5 the \Throwable type
        // does not exist and this clause simply never matches.
        mb_internal_encoding($origEncoding);
        throw $e;
    }

    mb_internal_encoding($origEncoding);
}
334+
335+ protected function doConvert ()
321336 {
322337 $ this ->linkList = array ();
323338
@@ -345,7 +360,7 @@ protected function converter(&$text)
345360 $ text = preg_replace_callback ($ this ->callbackSearch , array ($ this , 'pregCallback ' ), $ text );
346361 $ text = strip_tags ($ text );
347362 $ text = preg_replace ($ this ->entSearch , $ this ->entReplace , $ text );
348- $ text = html_entity_decode ($ text , ENT_QUOTES , self ::ENCODING );
363+ $ text = html_entity_decode ($ text , $ this -> htmlFuncFlags , self ::ENCODING );
349364
350365 // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
351366 $ text = preg_replace ('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/ ' , '' , $ text );
@@ -395,7 +410,7 @@ protected function buildlinkList($link, $display, $linkOverride = null)
395410 $ url = $ link ;
396411 } else {
397412 $ url = $ this ->baseurl ;
398- if (substr ($ link , 0 , 1 ) != '/ ' ) {
413+ if (mb_substr ($ link , 0 , 1 ) != '/ ' ) {
399414 $ url .= '/ ' ;
400415 }
401416 $ url .= $ link ;
@@ -472,7 +487,7 @@ protected function convertBlockquotes(&$text)
472487 $ end = $ m [1 ];
473488 $ len = $ end - $ taglen - $ start ;
474489 // Get blockquote content
475- $ body = substr ($ text , $ start + $ taglen - $ diff , $ len );
490+ $ body = mb_substr ($ text , $ start + $ taglen - $ diff , $ len );
476491
477492 // Set text width
478493 $ pWidth = $ this ->options ['width ' ];
@@ -482,20 +497,21 @@ protected function convertBlockquotes(&$text)
482497 $ this ->converter ($ body );
483498 // Add citation markers and create PRE block
484499 $ body = preg_replace ('/((^|\n)>*)/ ' , '\\1> ' , trim ($ body ));
485- $ body = '<pre> ' . htmlspecialchars ($ body ) . '</pre> ' ;
500+ $ body = '<pre> ' . htmlspecialchars ($ body, $ this -> htmlFuncFlags , self :: ENCODING ) . '</pre> ' ;
486501 // Re-set text width
487502 $ this ->options ['width ' ] = $ pWidth ;
488503 // Replace content
489- $ text = substr ($ text , 0 , $ start - $ diff )
490- . $ body . substr ($ text , $ end + strlen ($ m [0 ]) - $ diff );
504+ $ text = mb_substr ($ text , 0 , $ start - $ diff )
505+ . $ body
506+ . mb_substr ($ text , $ end + mb_strlen ($ m [0 ]) - $ diff );
491507
492- $ diff += $ len + $ taglen + strlen ($ m [0 ]) - strlen ($ body );
508+ $ diff += $ len + $ taglen + mb_strlen ($ m [0 ]) - mb_strlen ($ body );
493509 unset($ body );
494510 }
495511 } else {
496512 if ($ level == 0 ) {
497513 $ start = $ m [1 ];
498- $ taglen = strlen ($ m [0 ]);
514+ $ taglen = mb_strlen ($ m [0 ]);
499515 }
500516 $ level ++;
501517 }
@@ -511,7 +527,7 @@ protected function convertBlockquotes(&$text)
511527 */
512528 protected function pregCallback ($ matches )
513529 {
514- switch (strtolower ($ matches [1 ])) {
530+ switch (mb_strtolower ($ matches [1 ])) {
515531 case 'p ' :
516532 // Replace newlines with spaces.
517533 $ para = str_replace ("\n" , " " , $ matches [3 ]);
@@ -585,15 +601,9 @@ protected function toupper($str)
585601 */
protected function strtoupper($str)
{
    // Decode entities first so that entity names are not upper-cased as
    // plain text (e.g. "&amp;" must not turn into "&AMP;").
    $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);

    // Pass the encoding explicitly, like the two neighbouring calls do,
    // rather than depending on whatever mb_internal_encoding() happens to
    // be set to when this method is reached.
    $str = mb_strtoupper($str, self::ENCODING);

    // Re-encode the special characters that were decoded above, using the
    // same flags so the round trip is symmetric.
    $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);

    return $str;
}
0 commit comments