Skip to content

Commit b0e708a

Browse files
feat: update nytimes extractor (#506)
* feat: update custom extractor for nytimes.com
1 parent 3fb8526 commit b0e708a

File tree

8 files changed

+1170
-87
lines changed

8 files changed

+1170
-87
lines changed

fixtures/www.nytimes.com/1474061823854.html

Lines changed: 0 additions & 2 deletions
This file was deleted.

fixtures/www.nytimes.com/1474318141888.html

Lines changed: 0 additions & 1 deletion
This file was deleted.

fixtures/www.nytimes.com/1539194812689.html

Lines changed: 0 additions & 69 deletions
This file was deleted.

fixtures/www.nytimes.com/1571223287888.html

Lines changed: 1016 additions & 0 deletions
Large diffs are not rendered by default.

fixtures/www.nytimes.com/1571223477873.html

Lines changed: 75 additions & 0 deletions
Large diffs are not rendered by default.

fixtures/www.nytimes.com/1571224616991.html

Lines changed: 50 additions & 0 deletions
Large diffs are not rendered by default.

src/extractors/custom/www.nytimes.com/index.js

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,25 @@ export const NYTimesExtractor = {
22
domain: 'www.nytimes.com',
33

44
title: {
5-
selectors: ['h1.g-headline', 'h1[itemprop="headline"]', 'h1.headline'],
5+
selectors: [
6+
'h1.g-headline',
7+
'h1[itemprop="headline"]',
8+
'h1.headline',
9+
'h1 .balancedHeadline',
10+
],
611
},
712

813
author: {
9-
selectors: [['meta[name="author"]', 'value'], '.g-byline', '.byline'],
14+
selectors: [
15+
['meta[name="author"]', 'value'],
16+
'.g-byline',
17+
'.byline',
18+
['meta[name="byl"]', 'value'],
19+
],
1020
},
1121

1222
content: {
13-
selectors: ['div.g-blocks', 'article#story'],
23+
selectors: ['div.g-blocks', 'section[name="articleBody"]', 'article#story'],
1424

1525
transforms: {
1626
'img.g-lazy': $node => {

src/extractors/custom/www.nytimes.com/index.test.js

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ describe('NYTimesExtractor', () => {
1414
let url;
1515
beforeAll(() => {
1616
url =
17-
'http://www.nytimes.com/interactive/2016/09/15/arts/design/national-museum-of-african-american-history-and-culture.html';
17+
'https://www.nytimes.com/2016/09/20/nyregion/nyc-nj-explosions-ahmad-khan-rahami.html';
1818
const html = fs.readFileSync(
19-
'./fixtures/www.nytimes.com/1474318141888.html'
19+
'./fixtures/www.nytimes.com/1571224616991.html'
2020
);
2121
result = Mercury.parse(url, { html, fallback: false });
2222
});
@@ -76,7 +76,7 @@ describe('NYTimesExtractor', () => {
7676
// the article.
7777
assert.equal(
7878
lead_image_url,
79-
'https://static01.nyt.com/images/2016/09/20/nyregion/20MANHUNT1/20MANHUNT1-facebookJumbo.jpg'
79+
'https://static01.nyt.com/images/2016/09/20/nyregion/Manhunt/Manhunt-facebookJumbo-v2.jpg'
8080
);
8181
});
8282

@@ -100,34 +100,38 @@ describe('NYTimesExtractor', () => {
100100
// the article.
101101
assert.equal(
102102
first13,
103-
'The man believed to be responsible for the explosion in Manhattan on Saturday'
103+
'The man who the police said sowed terror across two states, setting off'
104104
);
105105
});
106106
});
107107

108108
it('works with a feature story', async () => {
109109
const html = fs.readFileSync(
110-
'./fixtures/www.nytimes.com/1474061823854.html'
110+
'./fixtures/www.nytimes.com/1571223287888.html'
111111
);
112112
const uri =
113113
'http://www.nytimes.com/interactive/2016/09/15/arts/design/national-museum-of-african-american-history-and-culture.html';
114114

115115
const { content, title, author } = await Mercury.parse(uri, { html });
116116
const $ = cheerio.load(content);
117-
const text = $('*')
118-
.first()
119-
.text()
120-
.trim()
121-
.slice(0, 20);
117+
const text = excerptContent(
118+
$('*')
119+
.first()
120+
.text(),
121+
13
122+
);
122123

123124
assert.equal(title, 'I, Too, Sing America');
124125
assert.equal(author, 'The New York Times');
125-
assert.equal(text, 'T he Smithsonian’s N');
126+
assert.equal(
127+
text,
128+
'T he Smithsonian’s National Museum of African American History and Culture opens on'
129+
);
126130
});
127131

128132
it('returns the title on most recent articles', async () => {
129133
const html = fs.readFileSync(
130-
'./fixtures/www.nytimes.com/1539194812689.html'
134+
'./fixtures/www.nytimes.com/1571223477873.html'
131135
);
132136
const uri =
133137
'https://www.nytimes.com/2018/10/09/us/politics/nikki-haley-united-nations.html';

0 commit comments

Comments
 (0)