Skip to content

Commit bff11d5

Browse files
iparamonaunleush
authored and committed
Review the use of readability and articlebody (#606)
* review the use of readability and articlebody
* fix conditions
* validate that articlebody is not empty
1 parent 465c64f commit bff11d5

File tree

9 files changed

+55
-24
lines changed

9 files changed

+55
-24
lines changed

config.local.js.SAMPLE

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,8 +223,8 @@ export default {
223223
// dnt: true,
224224
cache_ttl: 100 * 365 * 24 * 3600 // 100 Years.
225225
},
226-
readability: {
227-
enabled: false
226+
app: {
227+
// allow_readability: true
228228
// allowPTagDescription: true // to enable description fallback to first paragraph
229229
},
230230
images: {

lib/core.js

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,7 +1085,7 @@
10851085
return hasDomainData;
10861086
}
10871087

1088-
const BIG_CONTEXT = ['htmlparser', 'readability', 'decode'];
1088+
const BIG_CONTEXT = ['htmlparser', 'readability', 'decode', 'cheerio'];
10891089

10901090
function prepareResultData(uri, result, options) {
10911091

@@ -1969,15 +1969,6 @@
19691969
}
19701970
}
19711971

1972-
if (/* options.getProviderOptions('readability.enabled') === true */
1973-
CONFIG.providerOptions && CONFIG.providerOptions.readability
1974-
&& CONFIG.providerOptions.readability.enabled === true
1975-
|| options.readability) {
1976-
context.__readabilityEnabled = true;
1977-
// Prevent force load readability plugin.
1978-
usedParams.__readabilityEnabled = true;
1979-
}
1980-
19811972
asyncMethodCb('initial');
19821973
};
19831974

lib/loader/utils.js

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
"request",
77
"whitelistRecord",
88
"iframelyRun",
9-
"__readabilityEnabled",
109

1110
// Copy from `core.js` `utilsModules`.
1211
"utils",

plugins/links/article/article.js

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,24 @@
1+
import * as cheerio from 'cheerio';
2+
13
export default {
24

3-
getData: function(readability, meta, __is_general_article, utils) {
5+
provides: 'articlebody', // if not yet provided from LD articlebody
6+
7+
getData: function(__readabilityEnabled, readability, meta, utils) {
8+
9+
const articleHtml = utils.encodeText(meta.charset, readability.getHTML());
10+
const $p = cheerio.load(articleHtml)('p');
11+
12+
if ($p.text()) {
13+
return {
14+
articlebody: articleHtml
15+
}
16+
}
17+
},
418

19+
getVars: function(articlebody) {
520
return {
6-
safe_html: utils.encodeText(meta.charset, readability.getHTML())
21+
articlebody: articlebody
722
};
823
}
924
};
Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,30 @@
11
export default {
22

3-
provides: '__is_general_article',
3+
provides: [
4+
"__readabilityEnabled",
5+
"articlebody"
6+
],
47

5-
getData: function(meta, __readabilityEnabled) {
8+
getData: function(meta, options) {
69

7-
if (meta.og && (meta.og.type === "article" || meta.og.type === "blog")) {
10+
const ld = meta.ld?.newsarticle || meta.ld?.article || meta.ld?.blogposting || meta.ld?.reportagenewsarticle || meta.ld?.socialmediaposting;
811

9-
return {
10-
__is_general_article: true
11-
};
12+
if ((ld
13+
|| (meta.og && (meta.og.type === "article" || meta.og.type === "blog" || meta.og.type === 'website')
14+
|| meta.twitter?.card === 'summary_large_image'
15+
|| meta.article))
16+
17+
&& (options.getRequestOptions('readability.articlebody', false) || CONFIG.providerOptions?.app?.allow_readability === true)) {
18+
19+
if (ld?.articlebody && /\/>/.test(ld.articlebody)) {
20+
return {
21+
articlebody: ld.articlebody
22+
}
23+
} else if (options.getProviderOptions('app.allow_readability')) {
24+
return {
25+
__readabilityEnabled: true
26+
}
27+
}
1228
}
1329
}
1430
};

plugins/links/article/reader.js

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
export default {
2+
3+
getData: function(articlebody) {
4+
if (CONFIG.providerOptions?.app?.allow_readability === true && !CONFIG.SKIP_IFRAMELY_RENDERS) {
5+
return {
6+
safe_html: articlebody
7+
}
8+
}
9+
}
10+
};

plugins/links/embedURL/embedURL.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ export default {
44

55
provides: 'schemaVideoObject',
66

7-
getData: function(url, cheerio, decode, __allowEmbedURL, utils) {
7+
getData: function(url, __allowEmbedURL, cheerio, decode, utils) {
88

99
/* Let's try to find ld+json in the body first. */
1010
const ldSelector = 'script[type="application/ld+json"]:contains("VideoObject"), script[type="application/ld+json"]:contains("VideoObject")'

plugins/meta/description-from-p-tag.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ export default {
55
lowestPriority: true,
66
provides: '__allowPTagDescription',
77

8-
getMeta: function(cheerio, decode, __allowPTagDescription) {
8+
getMeta: function(__allowPTagDescription, cheerio, decode) {
99
// Get the text from the first <p> tag that's not in a header
1010
var description;
1111
cheerio("body p").each(function() {

static/js/debug.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,7 @@ function processUrl() {
385385
// Render context.
386386
var contexts = data.allData && data.allData
387387
.filter(function(d) {
388-
return d.method.name === 'getData';
388+
return d.method.name === 'getData' || d.method.name === 'getVars';
389389
})
390390
.map(function(d) {
391391
return d.data;

0 commit comments

Comments
 (0)