@@ -167,7 +167,7 @@ function nestedMfPropertyNamesFromClass($class) {
167167 }
168168 }
169169 }
170-
170+
171171 foreach ($ propertyNames as $ property => $ prefixes ) {
172172 $ propertyNames [$ property ] = array_unique ($ prefixes );
173173 }
@@ -338,14 +338,14 @@ public function __construct($input, $url = null, $jsonMode = false) {
338338 libxml_use_internal_errors (true );
339339 if (is_string ($ input )) {
340340 if (class_exists ('Masterminds \\HTML5 ' )) {
341- $ doc = new \Masterminds \HTML5 (array ('disable_html_ns ' => true ));
342- $ doc = $ doc ->loadHTML ($ input );
341+ $ doc = new \Masterminds \HTML5 (array ('disable_html_ns ' => true ));
342+ $ doc = $ doc ->loadHTML ($ input );
343343 } else {
344344 $ doc = new DOMDocument ();
345345 @$ doc ->loadHTML (unicodeToHtmlEntities ($ input ));
346346 }
347347 } elseif (is_a ($ input , 'DOMDocument ' )) {
348- $ doc = $ input ;
348+ $ doc = clone $ input ;
349349 } else {
350350 $ doc = new DOMDocument ();
351351 @$ doc ->loadHTML ('' );
@@ -402,7 +402,7 @@ private function isElementParsed(\DOMElement $e, $prefix) {
402402 if (!$ this ->parsed ->contains ($ e )) {
403403 return false ;
404404 }
405-
405+
406406 $ prefixes = $ this ->parsed [$ e ];
407407
408408 if (!in_array ($ prefix , $ prefixes )) {
@@ -443,101 +443,49 @@ private function resolveChildUrls(DOMElement $el) {
443443 }
444444 }
445445
446- public function textContent (DOMElement $ el ) {
447- $ excludeTags = array ('noframe ' , 'noscript ' , 'script ' , 'style ' , 'frames ' , 'frameset ' );
448-
449- if (isset ($ el ->tagName ) and in_array (strtolower ($ el ->tagName ), $ excludeTags )) {
450- return '' ;
451- }
452-
453- $ this ->resolveChildUrls ($ el );
454-
455- $ clonedEl = $ el ->cloneNode (true );
456-
457- foreach ($ this ->xpath ->query ('.//img ' , $ clonedEl ) as $ imgEl ) {
458- $ newNode = $ this ->doc ->createTextNode ($ imgEl ->getAttribute ($ imgEl ->hasAttribute ('alt ' ) ? 'alt ' : 'src ' ));
459- $ imgEl ->parentNode ->replaceChild ($ newNode , $ imgEl );
460- }
461-
462- foreach ($ excludeTags as $ tagName ) {
463- foreach ($ this ->xpath ->query (".// {$ tagName }" , $ clonedEl ) as $ elToRemove ) {
464- $ elToRemove ->parentNode ->removeChild ($ elToRemove );
465- }
466- }
467-
468- return $ this ->innerText ($ clonedEl );
446+ /**
447+ * The following two methods implement plain text parsing.
448+ * @see https://wiki.zegnat.net/media/textparsing.html
449+ **/
450+ public function textContent (DOMElement $ element )
451+ {
452+ return preg_replace (
453+ '/(^[\t\n\f\r ]+| +(?=\n)|(?<=\n) +| +(?= )|[\t\n\f\r ]+$)/ ' ,
454+ '' ,
455+ $ this ->elementToString ($ element )
456+ );
469457 }
470-
471- /**
472- * This method attempts to return a better 'innerText' representation than DOMNode::textContent
473- *
474- * @param DOMElement|DOMText $el
475- * @param bool $implied when parsing for implied name for h-*, rules may be slightly different
476- * @see: https://github.com/glennjones/microformat-shiv/blob/dev/lib/text.js
477- */
478- public function innerText ($ el , $ implied =false ) {
479- $ out = '' ;
480-
481- $ blockLevelTags = array ('h1 ' , 'h2 ' , 'h3 ' , 'h4 ' , 'h5 ' , 'h6 ' , 'p ' , 'hr ' , 'pre ' , 'table ' ,
482- 'address ' , 'article ' , 'aside ' , 'blockquote ' , 'caption ' , 'col ' , 'colgroup ' , 'dd ' , 'div ' ,
483- 'dt ' , 'dir ' , 'fieldset ' , 'figcaption ' , 'figure ' , 'footer ' , 'form ' , 'header ' , 'hgroup ' , 'hr ' ,
484- 'li ' , 'map ' , 'menu ' , 'nav ' , 'optgroup ' , 'option ' , 'section ' , 'tbody ' , 'testarea ' ,
485- 'tfoot ' , 'th ' , 'thead ' , 'tr ' , 'td ' , 'ul ' , 'ol ' , 'dl ' , 'details ' );
486-
487- $ excludeTags = array ('noframe ' , 'noscript ' , 'script ' , 'style ' , 'frames ' , 'frameset ' );
488-
489- // PHP DOMDocument doesn’t correctly handle whitespace around elements it doesn’t recognise.
490- $ unsupportedTags = array ('data ' );
491-
492- if (isset ($ el ->tagName )) {
493- if (in_array (strtolower ($ el ->tagName ), $ excludeTags )) {
494- return $ out ;
495- } else if ($ el ->tagName == 'img ' ) {
496- if ($ el ->hasAttribute ('alt ' )) {
497- return $ el ->getAttribute ('alt ' );
498- } else if (!$ implied && $ el ->hasAttribute ('src ' )) {
499- return $ this ->resolveUrl ($ el ->getAttribute ('src ' ));
500- }
501- } else if ($ el ->tagName == 'area ' and $ el ->hasAttribute ('alt ' )) {
502- return $ el ->getAttribute ('alt ' );
503- } else if ($ el ->tagName == 'abbr ' and $ el ->hasAttribute ('title ' )) {
504- return $ el ->getAttribute ('title ' );
505- }
506- }
507-
508- // if node is a text node get its text
509- if (isset ($ el ->nodeType ) && $ el ->nodeType === 3 ) {
510- $ out .= $ el ->textContent ;
511- }
512-
513- // get the text of the child nodes
514- if ($ el ->childNodes && $ el ->childNodes ->length > 0 ) {
515- for ($ j = 0 ; $ j < $ el ->childNodes ->length ; $ j ++) {
516- $ text = $ this ->innerText ($ el ->childNodes ->item ($ j ), $ implied );
517- if (!is_null ($ text )) {
518- $ out .= $ text ;
519- }
520- }
521- }
522-
523- if (isset ($ el ->tagName )) {
524- // if its a block level tag add an additional space at the end
525- if (in_array (strtolower ($ el ->tagName ), $ blockLevelTags )) {
526- $ out .= ' ' ;
527- } elseif ($ implied and in_array (strtolower ($ el ->tagName ), $ unsupportedTags )) {
528- $ out .= ' ' ;
529- } else if (strtolower ($ el ->tagName ) == 'br ' ) {
530- // else if its a br, replace with newline
531- $ out .= "\n" ;
458+ private function elementToString (DOMElement $ input )
459+ {
460+ $ output = '' ;
461+ foreach ($ input ->childNodes as $ child ) {
462+ if ($ child ->nodeType === XML_TEXT_NODE ) {
463+ $ output .= str_replace (array ("\t" , "\n" , "\r" ) , ' ' , $ child ->textContent );
464+ } else if ($ child ->nodeType === XML_ELEMENT_NODE ) {
465+ $ tagName = strtoupper ($ child ->tagName );
466+ if (in_array ($ tagName , array ('SCRIPT ' , 'STYLE ' ))) {
467+ continue ;
468+ } else if ($ tagName === 'IMG ' ) {
469+ if ($ child ->hasAttribute ('alt ' )) {
470+ $ output .= ' ' . trim ($ child ->getAttribute ('alt ' ), "\t\n\f\r " ) . ' ' ;
471+ } else if ($ child ->hasAttribute ('src ' )) {
472+ $ output .= ' ' . $ this ->resolveUrl (trim ($ child ->getAttribute ('src ' ), "\t\n\f\r " )) . ' ' ;
473+ }
474+ } else if ($ tagName === 'BR ' ) {
475+ $ output .= "\n" ;
476+ } else if ($ tagName === 'P ' ) {
477+ $ output .= "\n" . $ this ->elementToString ($ child );
478+ } else {
479+ $ output .= $ this ->elementToString ($ child );
480+ }
481+ }
532482 }
533- }
534-
535- return ($ out === '' ) ? NULL : $ out ;
483+ return $ output ;
536484 }
537485
538486 /**
539487 * This method parses the language of an element
540- * @param DOMElement $el
488+ * @param DOMElement $el
541489 * @access public
542490 * @return string
543491 */
@@ -547,7 +495,7 @@ public function language(DOMElement $el)
547495 if ($ el ->hasAttribute ('lang ' )) {
548496 return unicodeTrim ($ el ->getAttribute ('lang ' ));
549497 }
550-
498+
551499 if ($ el ->tagName == 'html ' ) {
552500 // we're at the <html> element and no lang; check <meta> http-equiv Content-Language
553501 foreach ( $ this ->xpath ->query ('.//meta[@http-equiv] ' ) as $ node )
@@ -558,7 +506,7 @@ public function language(DOMElement $el)
558506 }
559507 } elseif ($ el ->parentNode instanceof DOMElement) {
560508 // check the parent node
561- return $ this ->language ($ el ->parentNode );
509+ return $ this ->language ($ el ->parentNode );
562510 }
563511
564512 return '' ;
@@ -648,7 +596,7 @@ public function parseP(\DOMElement $p) {
648596 } elseif (in_array ($ p ->tagName , array ('data ' , 'input ' )) and $ p ->hasAttribute ('value ' )) {
649597 $ pValue = $ p ->getAttribute ('value ' );
650598 } else {
651- $ pValue = unicodeTrim ( $ this ->innerText ($ p) );
599+ $ pValue = $ this ->textContent ($ p );
652600 }
653601
654602 return $ pValue ;
@@ -670,23 +618,16 @@ public function parseU(\DOMElement $u) {
670618 $ uValue = $ u ->getAttribute ('poster ' );
671619 } elseif ($ u ->tagName == 'object ' and $ u ->hasAttribute ('data ' )) {
672620 $ uValue = $ u ->getAttribute ('data ' );
673- }
674-
675- if (isset ($ uValue )) {
676- return $ this ->resolveUrl ($ uValue );
677- }
678-
679- $ classTitle = $ this ->parseValueClassTitle ($ u );
680-
681- if ($ classTitle !== null ) {
682- return $ classTitle ;
621+ } elseif (($ classTitle = $ this ->parseValueClassTitle ($ u )) !== null ) {
622+ $ uValue = $ classTitle ;
683623 } elseif (($ u ->tagName == 'abbr ' or $ u ->tagName == 'link ' ) and $ u ->hasAttribute ('title ' )) {
684- return $ u ->getAttribute ('title ' );
624+ $ uValue = $ u ->getAttribute ('title ' );
685625 } elseif (in_array ($ u ->tagName , array ('data ' , 'input ' )) and $ u ->hasAttribute ('value ' )) {
686- return $ u ->getAttribute ('value ' );
626+ $ uValue = $ u ->getAttribute ('value ' );
687627 } else {
688- return unicodeTrim ( $ this ->textContent ($ u) );
628+ $ uValue = $ this ->textContent ($ u );
689629 }
630+ return $ this ->resolveUrl ($ uValue );
690631 }
691632
692633 /**
@@ -861,7 +802,7 @@ public function parseDT(\DOMElement $dt, &$dates = array(), &$impliedTimezone =
861802
862803 $ dtValue = unicodeTrim ($ dtValue );
863804
864- // Store the date part so that we can use it when assembling the final timestamp if the next one is missing a date part
805+ // Store the date part so that we can use it when assembling the final timestamp if the next one is missing a date part
865806 if (preg_match ('/(\d{4}-\d{2}-\d{2})/ ' , $ dtValue , $ matches )) {
866807 $ dates [] = $ matches [0 ];
867808 }
@@ -912,11 +853,13 @@ public function parseE(\DOMElement $e) {
912853 }
913854 $ html = $ e ->ownerDocument ->saveHtml ($ innerNodes );
914855 // Put the nodes back in place.
915- $ e ->appendChild ($ innerNodes );
856+ if ($ innerNodes ->hasChildNodes ()) {
857+ $ e ->appendChild ($ innerNodes );
858+ }
916859
917860 $ return = array (
918861 'html ' => unicodeTrim ($ html ),
919- 'value ' => unicodeTrim ( $ this ->innerText ($ e) ),
862+ 'value ' => $ this ->textContent ($ e ),
920863 );
921864
922865 if ($ this ->lang ) {
@@ -970,7 +913,7 @@ public function parseH(\DOMElement $e, $is_backcompat = false, $has_nested_mf =
970913
971914 // Handle p-*
972915 foreach ($ this ->xpath ->query ('.//*[contains(concat(" ", @class) ," p-")] ' , $ e ) as $ p ) {
973- // element is already parsed
916+ // element is already parsed
974917 if ($ this ->isElementParsed ($ p , 'p ' )) {
975918 continue ;
976919 // backcompat parsing and element was not upgraded; skip it
@@ -1123,7 +1066,7 @@ public function parseH(\DOMElement $e, $is_backcompat = false, $has_nested_mf =
11231066 }
11241067 }
11251068
1126- throw new Exception ($ this ->innerText ($ e , true ));
1069+ throw new Exception ($ this ->textContent ($ e , true ));
11271070 } catch (Exception $ exc ) {
11281071 $ return ['name ' ][] = unicodeTrim ($ exc ->getMessage ());
11291072 }
@@ -1175,6 +1118,11 @@ public function parseH(\DOMElement $e, $is_backcompat = false, $has_nested_mf =
11751118 $ mfTypes = array_unique ($ mfTypes );
11761119 sort ($ mfTypes );
11771120
1121+ // Properties should be an object when JSON serialised
1122+ if (empty ($ return ) and $ this ->jsonMode ) {
1123+ $ return = new stdClass ();
1124+ }
1125+
11781126 // Phew. Return the final result.
11791127 $ parsed = array (
11801128 'type ' => $ mfTypes ,
@@ -1218,8 +1166,8 @@ public function parseImpliedPhoto(\DOMElement $e) {
12181166 $ xpaths = array (
12191167 './img ' ,
12201168 './object ' ,
1221- './*[count(preceding-sibling::* )+count(following-sibling::* )=0]/img ' ,
1222- './*[count(preceding-sibling::* )+count(following-sibling::* )=0]/object ' ,
1169+ './*[not(contains(concat(" ", @class), " h-"))]/img[ count(preceding-sibling::img )+count(following-sibling::img )=0] ' ,
1170+ './*[not(contains(concat(" ", @class), " h-"))]/object[ count(preceding-sibling::object )+count(following-sibling::object )=0] ' ,
12231171 );
12241172
12251173 foreach ($ xpaths as $ path ) {
@@ -1351,7 +1299,7 @@ public function parseRelsAndAlternates() {
13511299
13521300 /**
13531301 * Find rel=tag elements that don't have class=category and have an href.
1354- * For each element, get the last non-empty URL segment. Append a <data>
1302+ * For each element, get the last non-empty URL segment. Append a <data>
13551303 * element with that value as the category. Uses the mf1 class 'category'
13561304 * which will then be upgraded to p-category during backcompat.
13571305 * @param DOMElement $el
@@ -1553,6 +1501,8 @@ public function backcompat(DOMElement $el, $context = '', $isParentMf2 = false)
15531501 $ mf1Classes = array_intersect ($ classes , array_keys ($ this ->classicRootMap ));
15541502 }
15551503
1504+ $ elHasMf2 = $ this ->hasRootMf2 ($ el );
1505+
15561506 foreach ($ mf1Classes as $ classname ) {
15571507 // special handling for specific properties
15581508 switch ( $ classname )
@@ -1647,7 +1597,7 @@ public function backcompat(DOMElement $el, $context = '', $isParentMf2 = false)
16471597 }
16481598 }
16491599
1650- if ( empty ($ context ) && isset ($ this ->classicRootMap [$ classname ]) && !$ this -> hasRootMf2 ( $ el ) ) {
1600+ if ( empty ($ context ) && isset ($ this ->classicRootMap [$ classname ]) && !$ elHasMf2 ) {
16511601 $ this ->addMfClasses ($ el , $ this ->classicRootMap [$ classname ]);
16521602 }
16531603 }
@@ -2155,8 +2105,8 @@ function resolveUrl($baseURI, $referenceURI) {
21552105
21562106 # 5.2.1 Pre-parse the Base URI
21572107 # The base URI (Base) is established according to the procedure of
2158- # Section 5.1 and parsed into the five main components described in
2159- # Section 3
2108+ # Section 5.1 and parsed into the five main components described in
2109+ # Section 3
21602110 $ base = parseUriToComponents ($ baseURI );
21612111
21622112 # If base path is blank (http://example.com) then set it to /
0 commit comments