Skip to content
This repository was archived by the owner on Aug 14, 2021. It is now read-only.

Commit 10c528c

Browse files
authored
Merge pull request #18 from andreskrey/development
Prepare for release v0.2.0
2 parents 2a493bc + a2d10aa commit 10c528c

File tree

31 files changed

+889
-4401
lines changed

31 files changed

+889
-4401
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,6 @@ php:
77
- "5.5"
88
- "5.6"
99
- "7.0"
10-
- "hhvm"
10+
- "7.1"
1111

1212
sudo: false

CHANGELOG.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,25 @@ All notable changes to this project will be documented in this file.
33

44
## Unreleased
55

6+
## [v0.2.0](https://github.com/andreskrey/readability.php/releases/tag/v0.2.0)
7+
8+
We ARE a 100% complete port of Readability.js!
9+
- Every test unit passes
10+
- Readability.php produces the same exact output as Readability.js
11+
- I'm happy :)
12+
13+
### Fixed
14+
- Lots of bugs
15+
- Merged PR by DavidFricker to avoid exceptions while grabbing the document content
16+
17+
### Added
18+
- substituteEntities flag, to avoid replacing special characters with HTML entities. There's nothing we can do about `&nbsp;`: that entity is replaced by libxml and there's no way to disable it.
19+
- Named data sets so it's easier to detect which test case is failing.
20+
21+
### Removed
22+
23+
- Couple of test cases that involved broken JS. There's nothing we can do about JS spilling onto the text.
24+
625
## [0.0.3-alpha](https://github.com/andreskrey/readability.php/releases/tag/v0.0.3v-alpha)
726

827
We are getting closer to being a 100% complete port of Readability.js!

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ If the parsing process was unsuccessful the HTMLParser will return `false`
5151
- **weightClasses**: default value `true`, weight classes during the rating phase.
5252
- **removeReadabilityTags**: default value `true`, remove the data-readability tags inside the nodes that are added during the rating phase.
5353
- **fixRelativeURLs**: default value `false`, convert relative URLs to absolute. Like `/test` to `http://host/test`.
54-
- **originalURL**: default value `http://fakehost`, original URL from the article used to fix relative URLs.
54+
- **substituteEntities**: default value `true`, controls the `substituteEntities` flag of libxml. Set it to `false` to avoid substituting HTML entities, like `&aacute;` to á.
55+
- **originalURL**: default value `http://fakehost`, original URL from the article used to fix relative URLs.
5556

5657
## Limitations
5758

src/HTMLParser.php

Lines changed: 65 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,6 @@ class HTMLParser
2626
*/
2727
private $metadata = [];
2828

29-
/**
30-
* @var array
31-
*/
32-
private $title = [];
33-
3429
/**
3530
* @var array
3631
*/
@@ -40,7 +35,7 @@ class HTMLParser
4035
'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
4136
'byline' => '/byline|author|dateline|writtenby|p-author/i',
4237
'replaceFonts' => '/<(\/?)font[^>]*>/gi',
43-
'normalize' => '/\s{2,}/g',
38+
'normalize' => '/\s{2,}/',
4439
'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i',
4540
'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i',
4641
'prevLink' => '/(prev|earl|old|new|<|«)/i',
@@ -104,6 +99,7 @@ public function __construct(array $options = [])
10499
'weightClasses' => true,
105100
'removeReadabilityTags' => true,
106101
'fixRelativeURLs' => false,
102+
'substituteEntities' => true,
107103
'originalURL' => 'http://fakehost',
108104
];
109105

@@ -137,7 +133,9 @@ public function parse($html)
137133

138134
$this->metadata = $this->getMetadata();
139135

140-
$this->title = $this->getTitle();
136+
$this->metadata['image'] = $this->getMainImage();
137+
138+
$this->metadata['title'] = $this->getTitle();
141139

142140
// Checking for minimum HTML to work with.
143141
if (!($root = $this->dom->getElementsByTagName('body')->item(0))) {
@@ -162,7 +160,11 @@ public function parse($html)
162160

163161
// TODO Better way to count resulting text. Textcontent usually has alt titles and that stuff
164162
// that doesn't really count to the quality of the result.
165-
if ($result && mb_strlen($result->textContent) < 500) {
163+
$length = 0;
164+
foreach ($result->getElementsByTagName('p') as $p) {
165+
$length += mb_strlen($p->textContent);
166+
}
167+
if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < 500) {
166168
$root = $this->backupdom->getElementsByTagName('body')->item(0);
167169

168170
if ($this->getConfig()->getOption('stripUnlikelyCandidates')) {
@@ -205,6 +207,11 @@ public function parse($html)
205207
*/
206208
private function loadHTML($html)
207209
{
210+
if (!$this->getConfig()->getOption('substituteEntities')) {
211+
// Keep the original HTML entities
212+
$this->dom->substituteEntities = false;
213+
}
214+
208215
// Prepend the XML tag to avoid having issues with special characters. Should be harmless.
209216
$this->dom->loadHTML('<?xml encoding="UTF-8">' . $html);
210217
$this->dom->encoding = 'UTF-8';
@@ -293,6 +300,15 @@ private function prepDocument()
293300
}
294301
}
295302
}
303+
304+
// Replace font tags with span
305+
$fonts = $this->dom->getElementsByTagName('font');
306+
$length = $fonts->length;
307+
for ($i = 0; $i < $length; $i++) {
308+
$font = $fonts->item($length - 1 - $i);
309+
$span = new Readability($font);
310+
$span->setNodeTag('span', true);
311+
}
296312
}
297313

298314
public function postProcessContent(DOMDocument $article)
@@ -436,11 +452,39 @@ private function getMetadata()
436452

437453
if (array_key_exists('og:image', $values) || array_key_exists('twitter:image', $values)) {
438454
$metadata['image'] = ($values['og:image']) ? $values['og:image'] : $values['twitter:image'];
455+
} else {
456+
$metadata['image'] = null;
439457
}
440458

441459
return $metadata;
442460
}
443461

462+
/**
463+
* Tries to get the main article image. Will only update the metadata if the getMetadata function couldn't
464+
* find a correct image.
465+
*
466+
* @return bool|string URL of the top image or false if unsuccessful.
467+
*/
468+
public function getMainImage()
469+
{
470+
if ($this->metadata['image'] !== null) {
471+
return $this->metadata['image'];
472+
}
473+
474+
foreach ($this->dom->getElementsByTagName('link') as $link) {
475+
/** @var \DOMElement $link */
476+
/*
477+
* Check for the rel attribute, then check if the rel attribute is either img_src or image_src, and
478+
* finally check for the existence of the href attribute, which should hold the image url.
479+
*/
480+
if ($link->hasAttribute('rel') && ($link->getAttribute('rel') === 'img_src' || $link->getAttribute('rel') === 'image_src') && $link->hasAttribute('href')) {
481+
return $link->getAttribute('href');
482+
}
483+
}
484+
485+
return false;
486+
}
487+
444488
/**
445489
* Get the density of links as a percentage of the content
446490
* This is the amount of text that is inside a link divided by the total text in the node.
@@ -493,6 +537,7 @@ private function getTitle()
493537
* Gets nodes from the root element.
494538
*
495539
* @param $node Readability
540+
*
496541
* @return array
497542
*/
498543
private function getNodes(Readability $node)
@@ -586,7 +631,7 @@ private function rateNodes($nodes)
586631
continue;
587632
}
588633
// Discard nodes with less than 25 characters, without blank space
589-
if (mb_strlen($node->getValue(true)) < 25) {
634+
if (mb_strlen($node->getTextContent(true)) < 25) {
590635
continue;
591636
}
592637

@@ -601,10 +646,10 @@ private function rateNodes($nodes)
601646
$contentScore = 1;
602647

603648
// Add points for any commas within this paragraph.
604-
$contentScore += count(explode(',', $node->getValue(true)));
649+
$contentScore += count(explode(',', $node->getTextContent(true)));
605650

606651
// For every 100 characters in this paragraph, add another point. Up to 3 points.
607-
$contentScore += min(floor(mb_strlen($node->getValue(true)) / 100), 3);
652+
$contentScore += min(floor(mb_strlen($node->getTextContent(true)) / 100), 3);
608653

609654
// Initialize and score ancestors.
610655
/** @var Readability $ancestor */
@@ -1066,13 +1111,20 @@ public function removeNode(\DOMNode $node)
10661111
* Checks if the node is a byline.
10671112
*
10681113
* @param Readability $node
1069-
* @param string $matchString
1114+
* @param string $matchString
10701115
*
10711116
* @return bool
10721117
*/
10731118
private function checkByline($node, $matchString)
10741119
{
1075-
if ($this->getConfig()->getOption('articleByLine')) {
1120+
if (!$this->getConfig()->getOption('articleByLine')) {
1121+
return false;
1122+
}
1123+
1124+
/*
1125+
* Check if the byline is already set
1126+
*/
1127+
if (isset($this->metadata['byline'])) {
10761128
return false;
10771129
}
10781130

src/Readability.php

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,8 +286,9 @@ public function getTextContent($normalize = false)
286286
* element with the new tag name and importing it to the main DOMDocument.
287287
*
288288
* @param string $value
289+
* @param bool $importAttributes
289290
*/
290-
public function setNodeTag($value)
291+
public function setNodeTag($value, $importAttributes = false)
291292
{
292293
$new = new \DOMDocument();
293294
$new->appendChild($new->createElement($value));
@@ -298,6 +299,13 @@ public function setNodeTag($value)
298299
$new->firstChild->appendChild($import);
299300
}
300301

302+
if ($importAttributes) {
303+
// Import attributes from the original node.
304+
foreach ($this->node->attributes as $attribute) {
305+
$new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue);
306+
}
307+
}
308+
301309
// The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement.
302310
$import = $this->node->ownerDocument->importNode($new->firstChild, true);
303311
$this->node->parentNode->replaceChild($import, $this->node);

test/HTMLParserTest.php

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,18 @@ class HTMLParserTest extends \PHPUnit_Framework_TestCase
99
/**
1010
* @dataProvider getSamplePages
1111
*/
12-
public function testHTMLParserParsesHTML($html, $expectedResult, $expectedMetadata)
12+
public function testHTMLParserParsesHTML($html, $expectedResult, $expectedMetadata, $config)
1313
{
14-
$readability = new HTMLParser([
15-
'originalURL' => 'http://fakehost/test/test.html',
16-
]);
14+
$options = ['originalURL' => 'http://fakehost/test/test.html',
15+
'fixRelativeURLs' => true,
16+
'substituteEntities' => true,
17+
];
18+
19+
if ($config) {
20+
$options = $config;
21+
}
22+
23+
$readability = new HTMLParser($options);
1724
$result = $readability->parse($html);
1825

1926
$this->assertEquals($expectedResult, $result['html']);
@@ -33,8 +40,12 @@ public function getSamplePages()
3340
$source = file_get_contents($path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR . 'source.html');
3441
$expectedHTML = file_get_contents($path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR . 'expected.html');
3542
$expectedMetadata = file_get_contents($path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR . 'expected-metadata.json');
43+
$config = file_get_contents($path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR . 'config.json');
44+
if ($config) {
45+
$config = json_decode($config, true);
46+
}
3647

37-
$pages[] = [$source, $expectedHTML, $expectedMetadata];
48+
$pages[$testPage] = [$source, $expectedHTML, $expectedMetadata, $config];
3849
}
3950

4051
return $pages;

test/ReadabilityTest.php

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,9 @@
44

55
class ReadabilityTest extends \PHPUnit_Framework_TestCase
66
{
7+
public function testDummy()
8+
{
9+
//TODO
10+
$this->assertEquals(true, true);
11+
}
712
}

test/test-pages/ars-1/config.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"articleByLine": true
3+
}

test/test-pages/clean-links/expected.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<div><td>
22
<h3 align="center ">Study Webtext</h3>
3-
<h2 align="center "><font color="Maroon&#xA; " face="Lucida Handwriting ">"Bartleby the Scrivener: A Story of Wall-Street " </font>(1853) <br></br>
3+
<h2 align="center "><span color="Maroon&#xA; " face="Lucida Handwriting ">"Bartleby the Scrivener: A Story of Wall-Street " </span>(1853) <br></br>
44
Herman Melville</h2>
55
<h2 align="center "><a href="http://www.vcu.edu/engweb/webtexts/bartleby.html&#xA; " target="_blank "><img align="absmiddle " alt="To the story text without notes&#xA; " height="38 " src="http://fakehost/test/hmhome.gif " width="38 "></img></a>
66
</h2>

test/test-pages/ehow-1/expected-metadata.json

Lines changed: 0 additions & 6 deletions
This file was deleted.

0 commit comments

Comments
 (0)