release: 2.2.0 (#496)

mtashley · web-flow · commit c5c000586d20 · 2019-09-10T09:51:14.000-07:00
* release: 2.2.0
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,9 +1,36 @@
 # Mercury Parser Changelog
 
+### 2.2.0 (Sept 10, 2019)
+
+##### Commits
+
+- [[`e12c916499`](https://github.com/postlight/mercury-parser/commit/e12c916499)] - **feat**: ability to add custom extractors via api (#484) (Michael Ashley)
+- [[`f95947fe88`](https://github.com/postlight/mercury-parser/commit/f95947fe88)] - Implemented custom extractor epaper.zeit.de (#488) (Sven Wiegand)
+- [[`2422e4717d`](https://github.com/postlight/mercury-parser/commit/2422e4717d)] - **fix**: incorrect parsing on medium.com (#477) (Michael Ashley)
+- [[`2bed238b68`](https://github.com/postlight/mercury-parser/commit/2bed238b68)] - chore(package): update inquirer to version 7.0.0 (#479) (greenkeeper[bot])
+- [[`869e44a69f`](https://github.com/postlight/mercury-parser/commit/869e44a69f)] - chore(package): update karma-chrome-launcher to version 3.0.0 (#458) (greenkeeper[bot])
+- [[`e4a7a288e5`](https://github.com/postlight/mercury-parser/commit/e4a7a288e5)] - chore(package): update eslint-config-prettier to version 6.1.0 (#476) (greenkeeper[bot])
+- [[`2173c4cf83`](https://github.com/postlight/mercury-parser/commit/2173c4cf83)] - **deps**: Update wuzzy to fix vulnerability (#462) (Malo Bourgon)
+- [[`a918a9d6fa`](https://github.com/postlight/mercury-parser/commit/a918a9d6fa)] - **doc**: correct link that points to wrong line (#469) (Jakob Fix)
+- [[`0686ee7956`](https://github.com/postlight/mercury-parser/commit/0686ee7956)] - **fix**: incorrect parsing on theatlantic.com (#475) (Michael Ashley)
+- [[`5e33263d25`](https://github.com/postlight/mercury-parser/commit/5e33263d25)] - **chore**: minifying biorxiv.com fixture (#478) (Michael Ashley)
+- [[`911b0f87c8`](https://github.com/postlight/mercury-parser/commit/911b0f87c8)] - Add custom extractor for biorxiv.org (#467) (david0leong)
+- [[`76d59f2d58`](https://github.com/postlight/mercury-parser/commit/76d59f2d58)] - **doc**: correct internal page links (#470) (Jakob Fix)
+- [[`398cba4d66`](https://github.com/postlight/mercury-parser/commit/398cba4d66)] - chore(deps): bump lodash.merge from 4.6.1 to 4.6.2 (#456) (dependabot[bot])
+- [[`90e208ea13`](https://github.com/postlight/mercury-parser/commit/90e208ea13)] - chore(deps): bump cached-path-relative from 1.0.0 to 1.0.2 (#472) (dependabot[bot])
+- [[`5bb7c58e95`](https://github.com/postlight/mercury-parser/commit/5bb7c58e95)] - chore(deps): bump merge from 1.2.0 to 1.2.1 (#473) (dependabot[bot])
+- [[`ce572f3a28`](https://github.com/postlight/mercury-parser/commit/ce572f3a28)] - chore(package): update brfs-babel to version 2.0.0 (#461) (greenkeeper[bot])
+- [[`6f65702a6c`](https://github.com/postlight/mercury-parser/commit/6f65702a6c)] - Update moment-timezone to the latest version 🚀 (#455) (greenkeeper[bot])
+- [[`c764cebc0c`](https://github.com/postlight/mercury-parser/commit/c764cebc0c)] - chore(package): update remark-cli to version 7.0.0 (#460) (greenkeeper[bot])
+- [[`853e041d84`](https://github.com/postlight/mercury-parser/commit/853e041d84)] - **deps**: update husky to the latest version 🚀 (#450) (greenkeeper[bot])
+- [[`f42f81218b`](https://github.com/postlight/mercury-parser/commit/f42f81218b)] - **deps**: update iconv-lite to the latest version 🚀 (#447) (greenkeeper[bot])
+- [[`592f175270`](https://github.com/postlight/mercury-parser/commit/592f175270)] - **tests**: remove a duplicate test (#448) (Kirill Danshin)
+
 ### 2.1.1 (Jun 26, 2019)
 
 ##### Commits
 
+- [[`713de25751`](https://github.com/postlight/mercury-parser/commit/713de25751)] - **release**: 2.1.1 (#446) (Adam Pash)
 - [[`c11b85f405`](https://github.com/postlight/mercury-parser/commit/c11b85f405)] - **deps**: update eslint-config-prettier to version 5.0.0 (#441) (greenkeeper[bot])
 - [[`3b0d5fed69`](https://github.com/postlight/mercury-parser/commit/3b0d5fed69)] - **chore**: prevent adding phantomjs-prebuilt as a dependency in CI. (#412) (Jaen)
 - [[`939d181951`](https://github.com/postlight/mercury-parser/commit/939d181951)] - **fix**: support query strings in lazy-loaded srcsets (#387) (Toufic Mouallem)
diff --git a/dist/mercury.js b/dist/mercury.js
@@ -21,6 +21,7 @@ var _parseFloat = _interopDefault(require('@babel/runtime-corejs2/core-js/parse-
 var _Set = _interopDefault(require('@babel/runtime-corejs2/core-js/set'));
 var _typeof = _interopDefault(require('@babel/runtime-corejs2/helpers/typeof'));
 var _getIterator = _interopDefault(require('@babel/runtime-corejs2/core-js/get-iterator'));
+var _Object$assign = _interopDefault(require('@babel/runtime-corejs2/core-js/object/assign'));
 var _Object$keys = _interopDefault(require('@babel/runtime-corejs2/core-js/object/keys'));
 var stringDirection = _interopDefault(require('string-direction'));
 var validUrl = _interopDefault(require('valid-url'));
@@ -1744,6 +1745,20 @@ function mergeSupportedDomains(extractor) {
   return extractor.supportedDomains ? merge(extractor, [extractor.domain].concat(_toConsumableArray(extractor.supportedDomains))) : merge(extractor, [extractor.domain]);
 }
 
+var apiExtractors = {};
+function addExtractor(extractor) {
+  if (!extractor || !extractor.domain) {
+    return {
+      error: true,
+      message: 'Unable to add custom extractor. Invalid parameters.'
+    };
+  }
+
+  _Object$assign(apiExtractors, mergeSupportedDomains(extractor));
+
+  return apiExtractors;
+}
+
 var BloggerExtractor = {
   domain: 'blogspot.com',
   content: {
@@ -1906,25 +1921,30 @@ var NYTimesExtractor = {
 var TheAtlanticExtractor = {
   domain: 'www.theatlantic.com',
   title: {
-    selectors: ['h1.hed']
+    selectors: ['h1', '.c-article-header__hed']
   },
   author: {
-    selectors: ['article#article .article-cover-extra .metadata .byline a']
+    selectors: [['meta[name="author"]', 'value'], '.c-byline__author']
   },
   content: {
-    selectors: [['.article-cover figure.lead-img', '.article-body'], '.article-body'],
+    selectors: ['article', '.article-body'],
     // Is there anything in the content you selected that needs transformed
     // before it's consumable content? E.g., unusual lazy loaded images
     transforms: [],
     // Is there anything that is in the result that shouldn't be?
     // The clean selectors will remove anything that matches from
     // the result
-    clean: ['.partner-box', '.callout']
+    clean: ['.partner-box', '.callout', '.c-article-writer__image', '.c-article-writer__content', '.c-letters-cta__text', '.c-footer__logo', '.c-recirculation-link', '.twitter-tweet']
+  },
+  dek: {
+    selectors: [['meta[name="description"]', 'value']]
   },
   date_published: {
-    selectors: [['time[itemProp="datePublished"]', 'datetime']]
+    selectors: [['time[itemprop="datePublished"]', 'datetime']]
+  },
+  lead_image_url: {
+    selectors: [['img[itemprop="url"]', 'src']]
   },
-  lead_image_url: null,
   next_page_url: null,
   excerpt: null
 };
@@ -2347,22 +2367,22 @@ var ApartmentTherapyExtractor = {
 
 var MediumExtractor = {
   domain: 'medium.com',
-  supportedDomains: ['trackchanges.postlight.com'],
   title: {
-    selectors: ['h1']
+    selectors: ['h1', ['meta[name="og:title"]', 'value']]
   },
   author: {
     selectors: [['meta[name="author"]', 'value']]
   },
   content: {
-    selectors: [['.section-content'], '.section-content', 'article > div > section'],
+    selectors: ['article'],
     // Is there anything in the content you selected that needs transformed
     // before it's consumable content? E.g., unusual lazy loaded images
     transforms: {
       // Re-write lazy-loaded youtube videos
       iframe: function iframe($node) {
         var ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
         var thumb = decodeURIComponent($node.attr('data-thumbnail'));
+        var $parent = $node.parents('figure');
 
         if (ytRe.test(thumb)) {
           var _thumb$match = thumb.match(ytRe),
@@ -2372,10 +2392,13 @@ var MediumExtractor = {
 
 
           $node.attr('src', "https://www.youtube.com/embed/".concat(youtubeId));
-          var $parent = $node.parents('figure');
           var $caption = $parent.find('figcaption');
           $parent.empty().append([$node, $caption]);
-        }
+          return;
+        } // If we can't draw the YouTube preview, remove the figure.
+
+
+        $parent.remove();
       },
       // rewrite figures to pull out image and caption, remove rest
       figure: function figure($node) {
@@ -2384,23 +2407,27 @@ var MediumExtractor = {
         var $img = $node.find('img').slice(-1)[0];
         var $caption = $node.find('figcaption');
         $node.empty().append([$img, $caption]);
+      },
+      // Remove any smaller images that did not get caught by the generic image
+      // cleaner (author photo 48px, leading sentence images 79px, etc.).
+      img: function img($node) {
+        var width = _parseInt($node.attr('width'), 10);
+
+        if (width < 100) $node.remove();
       }
     },
     // Is there anything that is in the result that shouldn't be?
     // The clean selectors will remove anything that matches from
     // the result
-    clean: []
+    clean: ['span', 'svg']
   },
   date_published: {
-    selectors: [['time[datetime]', 'datetime']]
+    selectors: [['meta[name="article:published_time"]', 'value']]
   },
   lead_image_url: {
     selectors: [['meta[name="og:image"]', 'value']]
   },
-  dek: {
-    selectors: [// enter selectors
-    ]
-  },
+  dek: null,
   next_page_url: {
     selectors: [// enter selectors
     ]
@@ -5690,6 +5717,56 @@ var PitchforkComExtractor = {
   }
 };
 
+var BiorxivOrgExtractor = {
+  domain: 'biorxiv.org',
+  title: {
+    selectors: ['h1#page-title']
+  },
+  author: {
+    selectors: ['div.highwire-citation-biorxiv-article-top > div.highwire-cite-authors']
+  },
+  content: {
+    selectors: ['div#abstract-1'],
+    // Is there anything in the content you selected that needs transformed
+    // before it's consumable content? E.g., unusual lazy loaded images
+    transforms: {},
+    // Is there anything that is in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: []
+  }
+};
+
+var EpaperZeitDeExtractor = {
+  domain: 'epaper.zeit.de',
+  title: {
+    selectors: ['p.title']
+  },
+  author: {
+    selectors: ['.article__author']
+  },
+  date_published: null,
+  excerpt: {
+    selectors: ['subtitle']
+  },
+  lead_image_url: null,
+  content: {
+    selectors: ['.article'],
+    // Is there anything in the content you selected that needs transformed
+    // before it's consumable content? E.g., unusual lazy loaded images
+    transforms: {
+      'p.title': 'h1',
+      '.article__author': 'p',
+      byline: 'p',
+      linkbox: 'p'
+    },
+    // Is there anything that is in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: ['image-credits', 'box[type=citation]']
+  }
+};
+
 
 
 var CustomExtractors = /*#__PURE__*/Object.freeze({
@@ -5824,7 +5901,9 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
   WwwRbbtodayComExtractor: WwwRbbtodayComExtractor,
   WwwLemondeFrExtractor: WwwLemondeFrExtractor,
   WwwPhoronixComExtractor: WwwPhoronixComExtractor,
-  PitchforkComExtractor: PitchforkComExtractor
+  PitchforkComExtractor: PitchforkComExtractor,
+  BiorxivOrgExtractor: BiorxivOrgExtractor,
+  EpaperZeitDeExtractor: EpaperZeitDeExtractor
 });
 
 var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
@@ -7152,7 +7231,7 @@ function getExtractor(url, parsedUrl, $) {
   var _parsedUrl = parsedUrl,
       hostname = _parsedUrl.hostname;
   var baseDomain = hostname.split('.').slice(-2).join('.');
-  return Extractors[hostname] || Extractors[baseDomain] || detectByHtml($) || GenericExtractor;
+  return apiExtractors[hostname] || apiExtractors[baseDomain] || Extractors[hostname] || Extractors[baseDomain] || detectByHtml($) || GenericExtractor;
 }
 
 function cleanBySelectors($content, $, _ref) {
@@ -7529,6 +7608,7 @@ var Mercury = {
           _opts$headers,
           headers,
           extend,
+          customExtractor,
           parsedUrl,
           $,
           Extractor,
@@ -7546,7 +7626,7 @@ var Mercury = {
           switch (_context.prev = _context.next) {
             case 0:
               _ref = _args.length > 1 && _args[1] !== undefined ? _args[1] : {}, html = _ref.html, opts = _objectWithoutProperties(_ref, ["html"]);
-              _opts$fetchAllPages = opts.fetchAllPages, fetchAllPages = _opts$fetchAllPages === void 0 ? true : _opts$fetchAllPages, _opts$fallback = opts.fallback, fallback = _opts$fallback === void 0 ? true : _opts$fallback, _opts$contentType = opts.contentType, contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType, _opts$headers = opts.headers, headers = _opts$headers === void 0 ? {} : _opts$headers, extend = opts.extend; // if no url was passed and this is the browser version,
+              _opts$fetchAllPages = opts.fetchAllPages, fetchAllPages = _opts$fetchAllPages === void 0 ? true : _opts$fetchAllPages, _opts$fallback = opts.fallback, fallback = _opts$fallback === void 0 ? true : _opts$fallback, _opts$contentType = opts.contentType, contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType, _opts$headers = opts.headers, headers = _opts$headers === void 0 ? {} : _opts$headers, extend = opts.extend, customExtractor = opts.customExtractor; // if no url was passed and this is the browser version,
               // set url to window.location.href and load the html
               // from the current page
 
@@ -7583,6 +7663,11 @@ var Mercury = {
               return _context.abrupt("return", $);
 
             case 11:
+              // Add custom extractor via cli.
+              if (customExtractor) {
+                addExtractor(customExtractor);
+              }
+
               Extractor = getExtractor(url, parsedUrl, $); // console.log(`Using extractor for ${Extractor.domain}`);
               // if html still has not been set (i.e., url passed to Mercury.parse),
               // set html from the response of Resource.create
@@ -7618,11 +7703,11 @@ var Mercury = {
               _result = result, title = _result.title, next_page_url = _result.next_page_url; // Fetch more pages if next_page_url found
 
               if (!(fetchAllPages && next_page_url)) {
-                _context.next = 24;
+                _context.next = 25;
                 break;
               }
 
-              _context.next = 21;
+              _context.next = 22;
               return collectAllPages({
                 Extractor: Extractor,
                 next_page_url: next_page_url,
@@ -7634,18 +7719,18 @@ var Mercury = {
                 url: url
               });
 
-            case 21:
+            case 22:
               result = _context.sent;
-              _context.next = 25;
+              _context.next = 26;
               break;
 
-            case 24:
+            case 25:
               result = _objectSpread({}, result, {
                 total_pages: 1,
                 rendered_pages: 1
               });
 
-            case 25:
+            case 26:
               if (contentType === 'markdown') {
                 turndownService = new TurndownService();
                 result.content = turndownService.turndown(result.content);
@@ -7655,7 +7740,7 @@ var Mercury = {
 
               return _context.abrupt("return", _objectSpread({}, result, extendedTypes));
 
-            case 27:
+            case 28:
             case "end":
               return _context.stop();
           }
@@ -7674,6 +7759,9 @@ var Mercury = {
   // to work with, e.g., for custom extractor generator
   fetchResource: function fetchResource(url) {
     return Resource.create(url);
+  },
+  addExtractor: function addExtractor$$1(extractor) {
+    return addExtractor(extractor);
   }
 };
 
diff --git a/dist/mercury.web.js b/dist/mercury.web.js
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@postlight/mercury-parser",
-  "version": "2.1.1",
+  "version": "2.2.0",
   "description": "Mercury transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.",
   "author": "Postlight <mercury@postlight.com>",
   "homepage": "https://mercury.postlight.com",

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "@postlight/mercury-parser",`
`3`		`- "version": "2.1.1",`
	`3`	`+ "version": "2.2.0",`
`4`	`4`	`"description": "Mercury transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.",`
`5`	`5`	`"author": "Postlight <mercury@postlight.com>",`
`6`	`6`	`"homepage": "https://mercury.postlight.com",`