Skip to content

Commit 7b68bcd

Browse files
authored
feat: remove obsolete custom extractors (postlight#712)
1 parent 4981355 commit 7b68bcd

File tree

12 files changed

+67
-581
lines changed

12 files changed

+67
-581
lines changed

dist/mercury.js

Lines changed: 66 additions & 108 deletions
Original file line number | Diff line number | Diff line change
@@ -2227,41 +2227,29 @@ var LittleThingsExtractor = {
22272227
excerpt: null
22282228
};
22292229

2230-
// Rename CustomExtractor
2231-
// to fit your publication
2232-
// (e.g., NYTimesExtractor)
22332230
var PoliticoExtractor = {
22342231
domain: 'www.politico.com',
22352232
title: {
2236-
selectors: [// enter title selectors
2237-
['meta[name="og:title"]', 'value']]
2233+
selectors: [['meta[name="og:title"]', 'value']]
22382234
},
22392235
author: {
2240-
selectors: ['.story-main-content .byline .vcard']
2236+
selectors: [['div[itemprop="author"] meta[itemprop="name"]', 'value'], '.story-meta__authors .vcard', '.story-main-content .byline .vcard']
22412237
},
22422238
content: {
2243-
selectors: [// enter content selectors
2244-
'.story-main-content', '.content-group', '.story-core', '.story-text'],
2245-
// Is there anything in the content you selected that needs transformed
2246-
// before it's consumable content? E.g., unusual lazy loaded images
2239+
selectors: [['.story-text'], '.story-main-content', '.story-core'],
22472240
transforms: [],
2248-
// Is there anything that is in the result that shouldn't be?
2249-
// The clean selectors will remove anything that matches from
2250-
// the result
2251-
clean: ['figcaption']
2241+
clean: ['figcaption', '.story-meta', '.ad']
22522242
},
22532243
date_published: {
2254-
selectors: [['.story-main-content .timestamp time[datetime]', 'datetime']]
2244+
selectors: [['time[itemprop="datePublished"]', 'datetime'], ['.story-meta__details time[datetime]', 'datetime'], ['.story-main-content .timestamp time[datetime]', 'datetime']],
2245+
timezone: 'America/New_York'
22552246
},
22562247
lead_image_url: {
2257-
selectors: [// enter lead_image_url selectors
2258-
['meta[name="og:image"]', 'value']]
2248+
selectors: [['meta[name="og:image"]', 'value']]
22592249
},
22602250
dek: {
2261-
selectors: []
2262-
},
2263-
next_page_url: null,
2264-
excerpt: null
2251+
selectors: [['meta[name="og:description"]', 'value']]
2252+
}
22652253
};
22662254

22672255
var DeadspinExtractor = {
@@ -3980,33 +3968,6 @@ var WwwCnetComExtractor = {
39803968
}
39813969
};
39823970

3983-
var WwwCinemablendComExtractor = {
3984-
domain: 'www.cinemablend.com',
3985-
title: {
3986-
selectors: ['.story_title']
3987-
},
3988-
author: {
3989-
selectors: ['.author']
3990-
},
3991-
date_published: {
3992-
selectors: [['meta[name="article:published_time"]', 'value']],
3993-
timezone: 'EST'
3994-
},
3995-
lead_image_url: {
3996-
selectors: [['meta[name="og:image"]', 'value']]
3997-
},
3998-
content: {
3999-
selectors: ['div#wrap_left_content'],
4000-
// Is there anything in the content you selected that needs transformed
4001-
// before it's consumable content? E.g., unusual lazy loaded images
4002-
transforms: {},
4003-
// Is there anything that is in the result that shouldn't be?
4004-
// The clean selectors will remove anything that matches from
4005-
// the result
4006-
clean: []
4007-
}
4008-
};
4009-
40103971
var WwwTodayComExtractor = {
40113972
domain: 'www.today.com',
40123973
title: {
@@ -4033,33 +3994,6 @@ var WwwTodayComExtractor = {
40333994
}
40343995
};
40353996

4036-
var WwwHowtogeekComExtractor = {
4037-
domain: 'www.howtogeek.com',
4038-
title: {
4039-
selectors: ['title']
4040-
},
4041-
author: {
4042-
selectors: ['#authorinfobox a']
4043-
},
4044-
date_published: {
4045-
selectors: ['#authorinfobox + div li'],
4046-
timezone: 'GMT'
4047-
},
4048-
lead_image_url: {
4049-
selectors: [['meta[name="og:image"]', 'value']]
4050-
},
4051-
content: {
4052-
selectors: ['.thecontent'],
4053-
// Is there anything in the content you selected that needs transformed
4054-
// before it's consumable content? E.g., unusual lazy loaded images
4055-
transforms: {},
4056-
// Is there anything that is in the result that shouldn't be?
4057-
// The clean selectors will remove anything that matches from
4058-
// the result
4059-
clean: []
4060-
}
4061-
};
4062-
40633997
var WwwAlComExtractor = {
40643998
domain: 'www.al.com',
40653999
title: {
@@ -4286,33 +4220,6 @@ var ThoughtcatalogComExtractor = {
42864220
}
42874221
};
42884222

4289-
var WwwNjComExtractor = {
4290-
domain: 'www.nj.com',
4291-
title: {
4292-
selectors: [['meta[name="title"]', 'value']]
4293-
},
4294-
author: {
4295-
selectors: [['meta[name="article_author"]', 'value']]
4296-
},
4297-
date_published: {
4298-
selectors: [['meta[name="article_date_original"]', 'value']],
4299-
timezone: 'America/New_York'
4300-
},
4301-
lead_image_url: {
4302-
selectors: [['meta[name="og:image"]', 'value']]
4303-
},
4304-
content: {
4305-
selectors: ['.entry-content'],
4306-
// Is there anything in the content you selected that needs transformed
4307-
// before it's consumable content? E.g., unusual lazy loaded images
4308-
transforms: {},
4309-
// Is there anything that is in the result that shouldn't be?
4310-
// The clean selectors will remove anything that matches from
4311-
// the result
4312-
clean: []
4313-
}
4314-
};
4315-
43164223
var WwwInquisitrComExtractor = {
43174224
domain: 'www.inquisitr.com',
43184225
title: {
@@ -6185,14 +6092,66 @@ var PostlightComExtractor = {
61856092
selectors: [['meta[name="og:image"]', 'value']]
61866093
},
61876094
content: {
6188-
selectors: ['article.body'],
6095+
selectors: ['main.post'],
6096+
// Is there anything in the content you selected that needs transformed
6097+
// before it's consumable content? E.g., unusual lazy loaded images
6098+
transforms: {},
6099+
// Is there anything that is in the result that shouldn't be?
6100+
// The clean selectors will remove anything that matches from
6101+
// the result
6102+
clean: ['section.pl-post-link', 'aside', 'section.insights_featured_case_studies']
6103+
}
6104+
};
6105+
6106+
var WwwInvestmentexecutiveComExtractor = {
6107+
domain: 'www.investmentexecutive.com',
6108+
title: {
6109+
selectors: ['h1']
6110+
},
6111+
author: {
6112+
selectors: ['div[itemprop="author"]']
6113+
},
6114+
date_published: {
6115+
selectors: [['meta[itemprop="datePublished"]', 'value']]
6116+
},
6117+
dek: {
6118+
selectors: [['meta[name="og:description"]', 'value']]
6119+
},
6120+
lead_image_url: {
6121+
selectors: [['meta[name="og:image"]', 'value']]
6122+
},
6123+
content: {
6124+
selectors: ['section.article-body'],
6125+
clean: ['.hidden']
6126+
}
6127+
};
6128+
6129+
var WwwCbcCaExtractor = {
6130+
domain: 'www.cbc.ca',
6131+
title: {
6132+
selectors: ['h1']
6133+
},
6134+
author: {
6135+
selectors: ['.authorText', '.bylineDetails']
6136+
},
6137+
date_published: {
6138+
selectors: [['.timeStamp[datetime]', 'datetime']]
6139+
},
6140+
dek: {
6141+
selectors: ['.deck']
6142+
},
6143+
lead_image_url: {
6144+
selectors: [['meta[name="og:image"]', 'value']]
6145+
},
6146+
content: {
6147+
selectors: ['.story'],
61896148
// Is there anything in the content you selected that needs transformed
61906149
// before it's consumable content? E.g., unusual lazy loaded images
61916150
transforms: {},
61926151
// Is there anything that is in the result that shouldn't be?
61936152
// The clean selectors will remove anything that matches from
61946153
// the result
6195-
clean: ['section.pl-post-link']
6154+
clean: []
61966155
}
61976156
};
61986157

@@ -6265,9 +6224,7 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
62656224
WwwSiComExtractor: WwwSiComExtractor,
62666225
WwwRawstoryComExtractor: WwwRawstoryComExtractor,
62676226
WwwCnetComExtractor: WwwCnetComExtractor,
6268-
WwwCinemablendComExtractor: WwwCinemablendComExtractor,
62696227
WwwTodayComExtractor: WwwTodayComExtractor,
6270-
WwwHowtogeekComExtractor: WwwHowtogeekComExtractor,
62716228
WwwAlComExtractor: WwwAlComExtractor,
62726229
WwwThepennyhoarderComExtractor: WwwThepennyhoarderComExtractor,
62736230
WwwWesternjournalismComExtractor: WwwWesternjournalismComExtractor,
@@ -6276,7 +6233,6 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
62766233
ScienceflyComExtractor: ScienceflyComExtractor,
62776234
HellogigglesComExtractor: HellogigglesComExtractor,
62786235
ThoughtcatalogComExtractor: ThoughtcatalogComExtractor,
6279-
WwwNjComExtractor: WwwNjComExtractor,
62806236
WwwInquisitrComExtractor: WwwInquisitrComExtractor,
62816237
WwwNbcnewsComExtractor: WwwNbcnewsComExtractor,
62826238
FortuneComExtractor: FortuneComExtractor,
@@ -6343,7 +6299,9 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
63436299
ArstechnicaComExtractor: ArstechnicaComExtractor,
63446300
WwwNdtvComExtractor: WwwNdtvComExtractor,
63456301
SpektrumExtractor: SpektrumExtractor,
6346-
PostlightComExtractor: PostlightComExtractor
6302+
PostlightComExtractor: PostlightComExtractor,
6303+
WwwInvestmentexecutiveComExtractor: WwwInvestmentexecutiveComExtractor,
6304+
WwwCbcCaExtractor: WwwCbcCaExtractor
63476305
});
63486306

63496307
var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {

dist/mercury.web.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

fixtures/www.cinemablend.com/1482432215722.html

Lines changed: 0 additions & 1 deletion
This file was deleted.

fixtures/www.howtogeek.com/1482438125052.html

Lines changed: 0 additions & 7 deletions
This file was deleted.

fixtures/www.nj.com/1481666201503.html

Lines changed: 0 additions & 56 deletions
This file was deleted.

src/extractors/custom/index.js

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -64,9 +64,7 @@ export * from './www.androidcentral.com';
6464
export * from './www.si.com';
6565
export * from './www.rawstory.com';
6666
export * from './www.cnet.com';
67-
export * from './www.cinemablend.com';
6867
export * from './www.today.com';
69-
export * from './www.howtogeek.com';
7068
export * from './www.al.com';
7169
export * from './www.thepennyhoarder.com';
7270
export * from './www.westernjournalism.com';
@@ -75,7 +73,6 @@ export * from './www.americanow.com';
7573
export * from './sciencefly.com';
7674
export * from './hellogiggles.com';
7775
export * from './thoughtcatalog.com';
78-
export * from './www.nj.com';
7976
export * from './www.inquisitr.com';
8077
export * from './www.nbcnews.com';
8178
export * from './fortune.com';

src/extractors/custom/www.cinemablend.com/index.js

Lines changed: 0 additions & 34 deletions
This file was deleted.

0 commit comments

Comments
 (0)