@@ -2227,41 +2227,29 @@ var LittleThingsExtractor = {
22272227 excerpt : null
22282228} ;
22292229
// Custom extractor definition for www.politico.com.
//
// Every field lists its candidate selectors. An entry is either a plain CSS
// selector string or an array; the two-element arrays here pair a selector
// with the attribute to read (e.g. 'value', 'datetime').
// NOTE(review): the order of entries looks like a preference order (newer
// markup first, legacy markup last) - confirm against the code that consumes
// these definitions, which is not part of this section.
var PoliticoExtractor = {
  domain: 'www.politico.com',

  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },

  author: {
    selectors: [
      ['div[itemprop="author"] meta[itemprop="name"]', 'value'],
      '.story-meta__authors .vcard',
      '.story-main-content .byline .vcard'
    ]
  },

  content: {
    // NOTE(review): the first entry is a one-element array rather than a
    // bare string; kept exactly as written - confirm this is intentional.
    selectors: [['.story-text'], '.story-main-content', '.story-core'],

    // No transforms are needed. An empty object is used (instead of the
    // former empty array) to match the shape of the other extractor
    // definitions in this file, which all declare `transforms : { }`.
    transforms: {},

    // The clean selectors remove anything that matches from the result.
    clean: ['figcaption', '.story-meta', '.ad']
  },

  date_published: {
    selectors: [
      ['time[itemprop="datePublished"]', 'datetime'],
      ['.story-meta__details time[datetime]', 'datetime'],
      ['.story-main-content .timestamp time[datetime]', 'datetime']
    ],
    // NOTE(review): presumably applied to timestamps that carry no UTC
    // offset - confirm against the date-cleaning code.
    timezone: 'America/New_York'
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },

  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  }
};
22662254
22672255var DeadspinExtractor = {
@@ -3980,33 +3968,6 @@ var WwwCnetComExtractor = {
39803968 }
39813969} ;
39823970
3983- var WwwCinemablendComExtractor = {
3984- domain : 'www.cinemablend.com' ,
3985- title : {
3986- selectors : [ '.story_title' ]
3987- } ,
3988- author : {
3989- selectors : [ '.author' ]
3990- } ,
3991- date_published : {
3992- selectors : [ [ 'meta[name="article:published_time"]' , 'value' ] ] ,
3993- timezone : 'EST'
3994- } ,
3995- lead_image_url : {
3996- selectors : [ [ 'meta[name="og:image"]' , 'value' ] ]
3997- } ,
3998- content : {
3999- selectors : [ 'div#wrap_left_content' ] ,
4000- // Is there anything in the content you selected that needs transformed
4001- // before it's consumable content? E.g., unusual lazy loaded images
4002- transforms : { } ,
4003- // Is there anything that is in the result that shouldn't be?
4004- // The clean selectors will remove anything that matches from
4005- // the result
4006- clean : [ ]
4007- }
4008- } ;
4009-
40103971var WwwTodayComExtractor = {
40113972 domain : 'www.today.com' ,
40123973 title : {
@@ -4033,33 +3994,6 @@ var WwwTodayComExtractor = {
40333994 }
40343995} ;
40353996
4036- var WwwHowtogeekComExtractor = {
4037- domain : 'www.howtogeek.com' ,
4038- title : {
4039- selectors : [ 'title' ]
4040- } ,
4041- author : {
4042- selectors : [ '#authorinfobox a' ]
4043- } ,
4044- date_published : {
4045- selectors : [ '#authorinfobox + div li' ] ,
4046- timezone : 'GMT'
4047- } ,
4048- lead_image_url : {
4049- selectors : [ [ 'meta[name="og:image"]' , 'value' ] ]
4050- } ,
4051- content : {
4052- selectors : [ '.thecontent' ] ,
4053- // Is there anything in the content you selected that needs transformed
4054- // before it's consumable content? E.g., unusual lazy loaded images
4055- transforms : { } ,
4056- // Is there anything that is in the result that shouldn't be?
4057- // The clean selectors will remove anything that matches from
4058- // the result
4059- clean : [ ]
4060- }
4061- } ;
4062-
40633997var WwwAlComExtractor = {
40643998 domain : 'www.al.com' ,
40653999 title : {
@@ -4286,33 +4220,6 @@ var ThoughtcatalogComExtractor = {
42864220 }
42874221} ;
42884222
4289- var WwwNjComExtractor = {
4290- domain : 'www.nj.com' ,
4291- title : {
4292- selectors : [ [ 'meta[name="title"]' , 'value' ] ]
4293- } ,
4294- author : {
4295- selectors : [ [ 'meta[name="article_author"]' , 'value' ] ]
4296- } ,
4297- date_published : {
4298- selectors : [ [ 'meta[name="article_date_original"]' , 'value' ] ] ,
4299- timezone : 'America/New_York'
4300- } ,
4301- lead_image_url : {
4302- selectors : [ [ 'meta[name="og:image"]' , 'value' ] ]
4303- } ,
4304- content : {
4305- selectors : [ '.entry-content' ] ,
4306- // Is there anything in the content you selected that needs transformed
4307- // before it's consumable content? E.g., unusual lazy loaded images
4308- transforms : { } ,
4309- // Is there anything that is in the result that shouldn't be?
4310- // The clean selectors will remove anything that matches from
4311- // the result
4312- clean : [ ]
4313- }
4314- } ;
4315-
43164223var WwwInquisitrComExtractor = {
43174224 domain : 'www.inquisitr.com' ,
43184225 title : {
@@ -6185,14 +6092,66 @@ var PostlightComExtractor = {
61856092 selectors : [ [ 'meta[name="og:image"]' , 'value' ] ]
61866093 } ,
61876094 content : {
6188- selectors : [ 'article.body' ] ,
6095+ selectors : [ 'main.post' ] ,
6096+ // Is there anything in the content you selected that needs transformed
6097+ // before it's consumable content? E.g., unusual lazy loaded images
6098+ transforms : { } ,
6099+ // Is there anything that is in the result that shouldn't be?
6100+ // The clean selectors will remove anything that matches from
6101+ // the result
6102+ clean : [ 'section.pl-post-link' , 'aside' , 'section.insights_featured_case_studies' ]
6103+ }
6104+ } ;
6105+
// Custom extractor definition for www.investmentexecutive.com.
//
// Each field lists its candidate selectors: a plain string is a CSS selector,
// while a [selector, attribute] pair names the attribute to read from the
// matched element (here always 'value' on <meta> tags).
var WwwInvestmentexecutiveComExtractor = {
  domain: 'www.investmentexecutive.com',

  title: {
    selectors: ['h1']
  },

  author: {
    selectors: ['div[itemprop="author"]']
  },

  date_published: {
    selectors: [['meta[itemprop="datePublished"]', 'value']]
  },

  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },

  content: {
    selectors: ['section.article-body'],
    // Anything matching a clean selector is removed from the result.
    clean: ['.hidden']
  }
};
6128+
// Custom extractor definition for www.cbc.ca.
//
// Each field lists its candidate selectors: a plain string is a CSS selector,
// while a [selector, attribute] pair names the attribute to read from the
// matched element.
var WwwCbcCaExtractor = {
  domain: 'www.cbc.ca',

  title: {
    selectors: ['h1']
  },

  author: {
    selectors: ['.authorText', '.bylineDetails']
  },

  date_published: {
    selectors: [['.timeStamp[datetime]', 'datetime']]
  },

  dek: {
    selectors: ['.deck']
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },

  content: {
    selectors: ['.story'],

    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},

    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: []
  }
};
61986157
@@ -6265,9 +6224,7 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
62656224 WwwSiComExtractor : WwwSiComExtractor ,
62666225 WwwRawstoryComExtractor : WwwRawstoryComExtractor ,
62676226 WwwCnetComExtractor : WwwCnetComExtractor ,
6268- WwwCinemablendComExtractor : WwwCinemablendComExtractor ,
62696227 WwwTodayComExtractor : WwwTodayComExtractor ,
6270- WwwHowtogeekComExtractor : WwwHowtogeekComExtractor ,
62716228 WwwAlComExtractor : WwwAlComExtractor ,
62726229 WwwThepennyhoarderComExtractor : WwwThepennyhoarderComExtractor ,
62736230 WwwWesternjournalismComExtractor : WwwWesternjournalismComExtractor ,
@@ -6276,7 +6233,6 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
62766233 ScienceflyComExtractor : ScienceflyComExtractor ,
62776234 HellogigglesComExtractor : HellogigglesComExtractor ,
62786235 ThoughtcatalogComExtractor : ThoughtcatalogComExtractor ,
6279- WwwNjComExtractor : WwwNjComExtractor ,
62806236 WwwInquisitrComExtractor : WwwInquisitrComExtractor ,
62816237 WwwNbcnewsComExtractor : WwwNbcnewsComExtractor ,
62826238 FortuneComExtractor : FortuneComExtractor ,
@@ -6343,7 +6299,9 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
63436299 ArstechnicaComExtractor : ArstechnicaComExtractor ,
63446300 WwwNdtvComExtractor : WwwNdtvComExtractor ,
63456301 SpektrumExtractor : SpektrumExtractor ,
6346- PostlightComExtractor : PostlightComExtractor
6302+ PostlightComExtractor : PostlightComExtractor ,
6303+ WwwInvestmentexecutiveComExtractor : WwwInvestmentexecutiveComExtractor ,
6304+ WwwCbcCaExtractor : WwwCbcCaExtractor
63476305} ) ;
63486306
63496307var Extractors = _Object$keys ( CustomExtractors ) . reduce ( function ( acc , key ) {
0 commit comments