Skip to content

Commit c5c0005

Browse files
authored
release: 2.2.0 (#496)
* release: 2.2.0
1 parent e12c916 commit c5c0005

File tree

4 files changed

+144
-29
lines changed

4 files changed

+144
-29
lines changed

CHANGELOG.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,36 @@
11
# Mercury Parser Changelog
22

3+
### 2.2.0 (Sept 10, 2019)
4+
5+
##### Commits
6+
7+
- [[`e12c916499`](https://github.com/postlight/mercury-parser/commit/e12c916499)] - **feat**: ability to add custom extractors via api (#484) (Michael Ashley)
8+
- [[`f95947fe88`](https://github.com/postlight/mercury-parser/commit/f95947fe88)] - Implemented custom extractor epaper.zeit.de (#488) (Sven Wiegand)
9+
- [[`2422e4717d`](https://github.com/postlight/mercury-parser/commit/2422e4717d)] - **fix**: incorrect parsing on medium.com (#477) (Michael Ashley)
10+
- [[`2bed238b68`](https://github.com/postlight/mercury-parser/commit/2bed238b68)] - chore(package): update inquirer to version 7.0.0 (#479) (greenkeeper[bot])
11+
- [[`869e44a69f`](https://github.com/postlight/mercury-parser/commit/869e44a69f)] - chore(package): update karma-chrome-launcher to version 3.0.0 (#458) (greenkeeper[bot])
12+
- [[`e4a7a288e5`](https://github.com/postlight/mercury-parser/commit/e4a7a288e5)] - chore(package): update eslint-config-prettier to version 6.1.0 (#476) (greenkeeper[bot])
13+
- [[`2173c4cf83`](https://github.com/postlight/mercury-parser/commit/2173c4cf83)] - **deps**: Update wuzzy to fix vulnerability (#462) (Malo Bourgon)
14+
- [[`a918a9d6fa`](https://github.com/postlight/mercury-parser/commit/a918a9d6fa)] - **doc**: correct link that points to wrong line (#469) (Jakob Fix)
15+
- [[`0686ee7956`](https://github.com/postlight/mercury-parser/commit/0686ee7956)] - **fix**: incorrect parsing on theatlantic.com (#475) (Michael Ashley)
16+
- [[`5e33263d25`](https://github.com/postlight/mercury-parser/commit/5e33263d25)] - **chore**: minifying biorxiv.com fixture (#478) (Michael Ashley)
17+
- [[`911b0f87c8`](https://github.com/postlight/mercury-parser/commit/911b0f87c8)] - Add custom extractor for biorxiv.org (#467) (david0leong)
18+
- [[`76d59f2d58`](https://github.com/postlight/mercury-parser/commit/76d59f2d58)] - **doc**: correct internal page links (#470) (Jakob Fix)
19+
- [[`398cba4d66`](https://github.com/postlight/mercury-parser/commit/398cba4d66)] - chore(deps): bump lodash.merge from 4.6.1 to 4.6.2 (#456) (dependabot[bot])
20+
- [[`90e208ea13`](https://github.com/postlight/mercury-parser/commit/90e208ea13)] - chore(deps): bump cached-path-relative from 1.0.0 to 1.0.2 (#472) (dependabot[bot])
21+
- [[`5bb7c58e95`](https://github.com/postlight/mercury-parser/commit/5bb7c58e95)] - chore(deps): bump merge from 1.2.0 to 1.2.1 (#473) (dependabot[bot])
22+
- [[`ce572f3a28`](https://github.com/postlight/mercury-parser/commit/ce572f3a28)] - chore(package): update brfs-babel to version 2.0.0 (#461) (greenkeeper[bot])
23+
- [[`6f65702a6c`](https://github.com/postlight/mercury-parser/commit/6f65702a6c)] - Update moment-timezone to the latest version 🚀 (#455) (greenkeeper[bot])
24+
- [[`c764cebc0c`](https://github.com/postlight/mercury-parser/commit/c764cebc0c)] - chore(package): update remark-cli to version 7.0.0 (#460) (greenkeeper[bot])
25+
- [[`853e041d84`](https://github.com/postlight/mercury-parser/commit/853e041d84)] - **deps**: update husky to the latest version 🚀 (#450) (greenkeeper[bot])
26+
- [[`f42f81218b`](https://github.com/postlight/mercury-parser/commit/f42f81218b)] - **deps**: update iconv-lite to the latest version 🚀 (#447) (greenkeeper[bot])
27+
- [[`592f175270`](https://github.com/postlight/mercury-parser/commit/592f175270)] - **tests**: remove a duplicate test (#448) (Kirill Danshin)
28+
329
### 2.1.1 (Jun 26, 2019)
430

531
##### Commits
632

33+
- [[`713de25751`](https://github.com/postlight/mercury-parser/commit/713de25751)] - **release**: 2.1.1 (#446) (Adam Pash)
734
- [[`c11b85f405`](https://github.com/postlight/mercury-parser/commit/c11b85f405)] - **deps**: update eslint-config-prettier to version 5.0.0 (#441) (greenkeeper[bot])
835
- [[`3b0d5fed69`](https://github.com/postlight/mercury-parser/commit/3b0d5fed69)] - **chore**: prevent adding phantomjs-prebuilt as a dependency in CI. (#412) (Jaen)
936
- [[`939d181951`](https://github.com/postlight/mercury-parser/commit/939d181951)] - **fix**: support query strings in lazy-loaded srcsets (#387) (Toufic Mouallem)

dist/mercury.js

Lines changed: 115 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ var _parseFloat = _interopDefault(require('@babel/runtime-corejs2/core-js/parse-
2121
var _Set = _interopDefault(require('@babel/runtime-corejs2/core-js/set'));
2222
var _typeof = _interopDefault(require('@babel/runtime-corejs2/helpers/typeof'));
2323
var _getIterator = _interopDefault(require('@babel/runtime-corejs2/core-js/get-iterator'));
24+
var _Object$assign = _interopDefault(require('@babel/runtime-corejs2/core-js/object/assign'));
2425
var _Object$keys = _interopDefault(require('@babel/runtime-corejs2/core-js/object/keys'));
2526
var stringDirection = _interopDefault(require('string-direction'));
2627
var validUrl = _interopDefault(require('valid-url'));
@@ -1744,6 +1745,20 @@ function mergeSupportedDomains(extractor) {
17441745
return extractor.supportedDomains ? merge(extractor, [extractor.domain].concat(_toConsumableArray(extractor.supportedDomains))) : merge(extractor, [extractor.domain]);
17451746
}
17461747

1748+
var apiExtractors = {};
1749+
function addExtractor(extractor) {
1750+
if (!extractor || !extractor.domain) {
1751+
return {
1752+
error: true,
1753+
message: 'Unable to add custom extractor. Invalid parameters.'
1754+
};
1755+
}
1756+
1757+
_Object$assign(apiExtractors, mergeSupportedDomains(extractor));
1758+
1759+
return apiExtractors;
1760+
}
1761+
17471762
var BloggerExtractor = {
17481763
domain: 'blogspot.com',
17491764
content: {
@@ -1906,25 +1921,30 @@ var NYTimesExtractor = {
19061921
var TheAtlanticExtractor = {
19071922
domain: 'www.theatlantic.com',
19081923
title: {
1909-
selectors: ['h1.hed']
1924+
selectors: ['h1', '.c-article-header__hed']
19101925
},
19111926
author: {
1912-
selectors: ['article#article .article-cover-extra .metadata .byline a']
1927+
selectors: [['meta[name="author"]', 'value'], '.c-byline__author']
19131928
},
19141929
content: {
1915-
selectors: [['.article-cover figure.lead-img', '.article-body'], '.article-body'],
1930+
selectors: ['article', '.article-body'],
19161931
// Is there anything in the content you selected that needs to be transformed
19171932
// before it's consumable content? E.g., unusual lazy loaded images
19181933
transforms: [],
19191934
// Is there anything that is in the result that shouldn't be?
19201935
// The clean selectors will remove anything that matches from
19211936
// the result
1922-
clean: ['.partner-box', '.callout']
1937+
clean: ['.partner-box', '.callout', '.c-article-writer__image', '.c-article-writer__content', '.c-letters-cta__text', '.c-footer__logo', '.c-recirculation-link', '.twitter-tweet']
1938+
},
1939+
dek: {
1940+
selectors: [['meta[name="description"]', 'value']]
19231941
},
19241942
date_published: {
1925-
selectors: [['time[itemProp="datePublished"]', 'datetime']]
1943+
selectors: [['time[itemprop="datePublished"]', 'datetime']]
1944+
},
1945+
lead_image_url: {
1946+
selectors: [['img[itemprop="url"]', 'src']]
19261947
},
1927-
lead_image_url: null,
19281948
next_page_url: null,
19291949
excerpt: null
19301950
};
@@ -2347,22 +2367,22 @@ var ApartmentTherapyExtractor = {
23472367

23482368
var MediumExtractor = {
23492369
domain: 'medium.com',
2350-
supportedDomains: ['trackchanges.postlight.com'],
23512370
title: {
2352-
selectors: ['h1']
2371+
selectors: ['h1', ['meta[name="og:title"]', 'value']]
23532372
},
23542373
author: {
23552374
selectors: [['meta[name="author"]', 'value']]
23562375
},
23572376
content: {
2358-
selectors: [['.section-content'], '.section-content', 'article > div > section'],
2377+
selectors: ['article'],
23592378
// Is there anything in the content you selected that needs to be transformed
23602379
// before it's consumable content? E.g., unusual lazy loaded images
23612380
transforms: {
23622381
// Re-write lazy-loaded youtube videos
23632382
iframe: function iframe($node) {
23642383
var ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
23652384
var thumb = decodeURIComponent($node.attr('data-thumbnail'));
2385+
var $parent = $node.parents('figure');
23662386

23672387
if (ytRe.test(thumb)) {
23682388
var _thumb$match = thumb.match(ytRe),
@@ -2372,10 +2392,13 @@ var MediumExtractor = {
23722392

23732393

23742394
$node.attr('src', "https://www.youtube.com/embed/".concat(youtubeId));
2375-
var $parent = $node.parents('figure');
23762395
var $caption = $parent.find('figcaption');
23772396
$parent.empty().append([$node, $caption]);
2378-
}
2397+
return;
2398+
} // If we can't draw the YouTube preview, remove the figure.
2399+
2400+
2401+
$parent.remove();
23792402
},
23802403
// rewrite figures to pull out image and caption, remove rest
23812404
figure: function figure($node) {
@@ -2384,23 +2407,27 @@ var MediumExtractor = {
23842407
var $img = $node.find('img').slice(-1)[0];
23852408
var $caption = $node.find('figcaption');
23862409
$node.empty().append([$img, $caption]);
2410+
},
2411+
// Remove any smaller images that did not get caught by the generic image
2412+
// cleaner (author photo 48px, leading sentence images 79px, etc.).
2413+
img: function img($node) {
2414+
var width = _parseInt($node.attr('width'), 10);
2415+
2416+
if (width < 100) $node.remove();
23872417
}
23882418
},
23892419
// Is there anything that is in the result that shouldn't be?
23902420
// The clean selectors will remove anything that matches from
23912421
// the result
2392-
clean: []
2422+
clean: ['span', 'svg']
23932423
},
23942424
date_published: {
2395-
selectors: [['time[datetime]', 'datetime']]
2425+
selectors: [['meta[name="article:published_time"]', 'value']]
23962426
},
23972427
lead_image_url: {
23982428
selectors: [['meta[name="og:image"]', 'value']]
23992429
},
2400-
dek: {
2401-
selectors: [// enter selectors
2402-
]
2403-
},
2430+
dek: null,
24042431
next_page_url: {
24052432
selectors: [// enter selectors
24062433
]
@@ -5690,6 +5717,56 @@ var PitchforkComExtractor = {
56905717
}
56915718
};
56925719

5720+
var BiorxivOrgExtractor = {
5721+
domain: 'biorxiv.org',
5722+
title: {
5723+
selectors: ['h1#page-title']
5724+
},
5725+
author: {
5726+
selectors: ['div.highwire-citation-biorxiv-article-top > div.highwire-cite-authors']
5727+
},
5728+
content: {
5729+
selectors: ['div#abstract-1'],
5730+
// Is there anything in the content you selected that needs to be transformed
5731+
// before it's consumable content? E.g., unusual lazy loaded images
5732+
transforms: {},
5733+
// Is there anything that is in the result that shouldn't be?
5734+
// The clean selectors will remove anything that matches from
5735+
// the result
5736+
clean: []
5737+
}
5738+
};
5739+
5740+
var EpaperZeitDeExtractor = {
5741+
domain: 'epaper.zeit.de',
5742+
title: {
5743+
selectors: ['p.title']
5744+
},
5745+
author: {
5746+
selectors: ['.article__author']
5747+
},
5748+
date_published: null,
5749+
excerpt: {
5750+
selectors: ['subtitle']
5751+
},
5752+
lead_image_url: null,
5753+
content: {
5754+
selectors: ['.article'],
5755+
// Is there anything in the content you selected that needs transformed
5756+
// before it's consumable content? E.g., unusual lazy loaded images
5757+
transforms: {
5758+
'p.title': 'h1',
5759+
'.article__author': 'p',
5760+
byline: 'p',
5761+
linkbox: 'p'
5762+
},
5763+
// Is there anything that is in the result that shouldn't be?
5764+
// The clean selectors will remove anything that matches from
5765+
// the result
5766+
clean: ['image-credits', 'box[type=citation]']
5767+
}
5768+
};
5769+
56935770

56945771

56955772
var CustomExtractors = /*#__PURE__*/Object.freeze({
@@ -5824,7 +5901,9 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
58245901
WwwRbbtodayComExtractor: WwwRbbtodayComExtractor,
58255902
WwwLemondeFrExtractor: WwwLemondeFrExtractor,
58265903
WwwPhoronixComExtractor: WwwPhoronixComExtractor,
5827-
PitchforkComExtractor: PitchforkComExtractor
5904+
PitchforkComExtractor: PitchforkComExtractor,
5905+
BiorxivOrgExtractor: BiorxivOrgExtractor,
5906+
EpaperZeitDeExtractor: EpaperZeitDeExtractor
58285907
});
58295908

58305909
var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
@@ -7152,7 +7231,7 @@ function getExtractor(url, parsedUrl, $) {
71527231
var _parsedUrl = parsedUrl,
71537232
hostname = _parsedUrl.hostname;
71547233
var baseDomain = hostname.split('.').slice(-2).join('.');
7155-
return Extractors[hostname] || Extractors[baseDomain] || detectByHtml($) || GenericExtractor;
7234+
return apiExtractors[hostname] || apiExtractors[baseDomain] || Extractors[hostname] || Extractors[baseDomain] || detectByHtml($) || GenericExtractor;
71567235
}
71577236

71587237
function cleanBySelectors($content, $, _ref) {
@@ -7529,6 +7608,7 @@ var Mercury = {
75297608
_opts$headers,
75307609
headers,
75317610
extend,
7611+
customExtractor,
75327612
parsedUrl,
75337613
$,
75347614
Extractor,
@@ -7546,7 +7626,7 @@ var Mercury = {
75467626
switch (_context.prev = _context.next) {
75477627
case 0:
75487628
_ref = _args.length > 1 && _args[1] !== undefined ? _args[1] : {}, html = _ref.html, opts = _objectWithoutProperties(_ref, ["html"]);
7549-
_opts$fetchAllPages = opts.fetchAllPages, fetchAllPages = _opts$fetchAllPages === void 0 ? true : _opts$fetchAllPages, _opts$fallback = opts.fallback, fallback = _opts$fallback === void 0 ? true : _opts$fallback, _opts$contentType = opts.contentType, contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType, _opts$headers = opts.headers, headers = _opts$headers === void 0 ? {} : _opts$headers, extend = opts.extend; // if no url was passed and this is the browser version,
7629+
_opts$fetchAllPages = opts.fetchAllPages, fetchAllPages = _opts$fetchAllPages === void 0 ? true : _opts$fetchAllPages, _opts$fallback = opts.fallback, fallback = _opts$fallback === void 0 ? true : _opts$fallback, _opts$contentType = opts.contentType, contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType, _opts$headers = opts.headers, headers = _opts$headers === void 0 ? {} : _opts$headers, extend = opts.extend, customExtractor = opts.customExtractor; // if no url was passed and this is the browser version,
75507630
// set url to window.location.href and load the html
75517631
// from the current page
75527632

@@ -7583,6 +7663,11 @@ var Mercury = {
75837663
return _context.abrupt("return", $);
75847664

75857665
case 11:
7666+
// Add custom extractor via cli.
7667+
if (customExtractor) {
7668+
addExtractor(customExtractor);
7669+
}
7670+
75867671
Extractor = getExtractor(url, parsedUrl, $); // console.log(`Using extractor for ${Extractor.domain}`);
75877672
// if html still has not been set (i.e., url passed to Mercury.parse),
75887673
// set html from the response of Resource.create
@@ -7618,11 +7703,11 @@ var Mercury = {
76187703
_result = result, title = _result.title, next_page_url = _result.next_page_url; // Fetch more pages if next_page_url found
76197704

76207705
if (!(fetchAllPages && next_page_url)) {
7621-
_context.next = 24;
7706+
_context.next = 25;
76227707
break;
76237708
}
76247709

7625-
_context.next = 21;
7710+
_context.next = 22;
76267711
return collectAllPages({
76277712
Extractor: Extractor,
76287713
next_page_url: next_page_url,
@@ -7634,18 +7719,18 @@ var Mercury = {
76347719
url: url
76357720
});
76367721

7637-
case 21:
7722+
case 22:
76387723
result = _context.sent;
7639-
_context.next = 25;
7724+
_context.next = 26;
76407725
break;
76417726

7642-
case 24:
7727+
case 25:
76437728
result = _objectSpread({}, result, {
76447729
total_pages: 1,
76457730
rendered_pages: 1
76467731
});
76477732

7648-
case 25:
7733+
case 26:
76497734
if (contentType === 'markdown') {
76507735
turndownService = new TurndownService();
76517736
result.content = turndownService.turndown(result.content);
@@ -7655,7 +7740,7 @@ var Mercury = {
76557740

76567741
return _context.abrupt("return", _objectSpread({}, result, extendedTypes));
76577742

7658-
case 27:
7743+
case 28:
76597744
case "end":
76607745
return _context.stop();
76617746
}
@@ -7674,6 +7759,9 @@ var Mercury = {
76747759
// to work with, e.g., for custom extractor generator
76757760
fetchResource: function fetchResource(url) {
76767761
return Resource.create(url);
7762+
},
7763+
addExtractor: function addExtractor$$1(extractor) {
7764+
return addExtractor(extractor);
76777765
}
76787766
};
76797767

dist/mercury.web.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@postlight/mercury-parser",
3-
"version": "2.1.1",
3+
"version": "2.2.0",
44
"description": "Mercury transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.",
55
"author": "Postlight <[email protected]>",
66
"homepage": "https://mercury.postlight.com",

0 commit comments

Comments
 (0)