Skip to content
This repository was archived by the owner on Jul 23, 2025. It is now read-only.

Commit 14f7ac3

Browse files
author
Edinei Cavalcanti
committed
Merge pull request #5 from clochix/master
Update MDN scraping scripts and add Web APIs
2 parents 8885525 + a36a2e2 commit 14f7ac3

File tree

8 files changed

+217
-49
lines changed

8 files changed

+217
-49
lines changed

scraper/css-mdn.js

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,28 @@
11
var requirejs = require('requirejs');
22

33
requirejs([
4-
'step',
54
'spider',
65
'underscore',
76
'cheerio',
87
'../models/sectionscrape',
98
'path',
109
'fs'
11-
], function(step, spider, _, cheerio, SectionScrape, path, fs) {
10+
], function(spider, _, cheerio, SectionScrape, path, fs) {
1211

1312
var results = [];
1413

1514
var spidey = spider();
1615

1716
// use this to visit all links on a page
1817
var visitLinks = function($) {
19-
$('a').each(function() {
18+
$('a:not([class*=new])').each(function() {
2019
var href = $(this).attr('href');
21-
spidey.get(href);
20+
if (href && href.substr(0, 4) !== 'http') {
21+
href = 'https://developer.mozilla.org' + href;
22+
}
23+
if (href && href.indexOf('$') === -1 && href.indexOf('?') === -1 && /Web\/CSS/.exec(href.split('=')[0]) !== null) {
24+
spidey.get(href);
25+
}
2226
});
2327
};
2428

@@ -28,31 +32,29 @@ requirejs([
2832
var file = fs.openSync(filename,'w');
2933

3034
// main index of mdn's css docs
31-
spidey.route('developer.mozilla.org', '/en/CSS_Reference', function ($) {
35+
spidey.route('developer.mozilla.org', '/en-US/docs/tag/CSS', function ($) {
3236
visitLinks($);
3337
});
3438

3539
var blacklist = [
36-
'https://developer.mozilla.org/en/CSS/CSS_Reference'
37-
, 'https://developer.mozilla.org/en/CSS/CSS_Reference/Property_Template'
3840
];
3941

4042
// some urls redirect to other pages w/o changing the url (for example: https://developer.mozilla.org/en/CSS/-moz-scrollbars-none)
4143
// so in addition to not visiting the same url twice, keep this list to prevent visiting the same title twice
4244
var titles = [];
4345

44-
spidey.route('developer.mozilla.org', /\/en\/CSS\/*/, function ($, url) {
45-
if (_.include(blacklist,url)) return;
46+
spidey.route('developer.mozilla.org', /\/en\-US\/docs\/Web\/CSS\/*/, function ($, url) {
47+
if ( _.include(blacklist,url) ) return;
4648
visitLinks($);
4749

4850
console.log('---------');
4951
console.log('scraping:',url);
5052

51-
var title = $('article .page-title h1').text().trim();
53+
var title = $('h1').text().trim();
5254
if ( title === '' || title === null ) {
5355
console.log('ERROR: could not get title, skipping');
5456
return;
55-
} else if ( _.indexOf(titles,title) !== -1 ) {
57+
} else if ( _.include(titles,title) ) {
5658
console.log('WARNING: already scraped something with this title, skipping');
5759
return;
5860
}
@@ -66,15 +68,15 @@ requirejs([
6668
scrapeData['sectionHTMLs'] = [];
6769

6870
// get all section ids
69-
var ids = _.map($('[id^=section_]'), function(div) { return div.attribs.id } );
71+
var ids = _.map($('article[id]'), function(div) { return div.attribs.id } );
7072
if ( ids.length === 0 ) {
7173
console.log('WARNING: no sections...');
7274
return;
7375
}
7476

7577
for ( var i = 0; i < ids.length; i++ ) {
7678
// load the section html as its own jquery object
77-
var $section = cheerio.load($('[id^=' + ids[i] + ']').html());
79+
var $section = cheerio.load($('[id^="' + ids[i] + '"]').html());
7880

7981
// strip scripts
8082
$section('script').remove();
@@ -99,7 +101,7 @@ requirejs([
99101
});
100102

101103
// start 'er up
102-
spidey.get('https://developer.mozilla.org/en/CSS_Reference').log('info');
104+
spidey.get('https://developer.mozilla.org/en-US/docs/tag/CSS').log('info');
103105

104106
process.on('exit', function () {
105107
fs.writeSync(file,JSON.stringify(results,null,'\t'));

scraper/html-mdn.js

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,28 @@
11
var requirejs = require('requirejs');
22

33
requirejs([
4-
'step',
54
'spider',
65
'underscore',
76
'cheerio',
87
'../models/sectionscrape',
98
'path',
109
'fs'
11-
], function(step, spider, _, cheerio, SectionScrape, path, fs) {
10+
], function(spider, _, cheerio, SectionScrape, path, fs) {
1211

1312
var results = [];
1413

1514
var spidey = spider();
1615

1716
// use this to visit all links on a page
1817
var visitLinks = function($) {
19-
$('a').each(function() {
18+
$('a:not([class*=new])').each(function() {
2019
var href = $(this).attr('href');
21-
spidey.get(href);
20+
if (href && href.substr(0, 4) !== 'http') {
21+
href = 'https://developer.mozilla.org' + href;
22+
}
23+
if (href && href.indexOf('$') === -1 /*&& href.indexOf('?') === -1*/ && /Web\/HTML\/Element/.exec(href.split('=')[0]) !== null) {
24+
spidey.get(href);
25+
}
2226
});
2327
};
2428

@@ -28,7 +32,7 @@ requirejs([
2832
var file = fs.openSync(filename,'w');
2933

3034
// main index of mdn's html docs
31-
spidey.route('developer.mozilla.org', '/en/HTML/Element', function ($) {
35+
spidey.route('developer.mozilla.org', '/en-US/docs/tag/HTML', function ($) {
3236
visitLinks($);
3337
});
3438

@@ -40,18 +44,18 @@ requirejs([
4044
// so in addition to not visiting the same url twice, keep this list to prevent visiting the same title twice
4145
var titles = [];
4246

43-
spidey.route('developer.mozilla.org', /\/en\/HTML\/Element\/*/, function ($, url) {
44-
if ( _.indexOf(blacklist,url) !== -1 ) return;
47+
spidey.route('developer.mozilla.org', /\/en\-US\/docs\/Web\/HTML\/Element\/*/, function ($, url) {
48+
if ( _.include(blacklist,url) ) return;
4549
visitLinks($);
4650

4751
console.log('---------');
4852
console.log('scraping:',url);
4953

50-
var title = $('article .page-title h1').text().trim();
54+
var title = $('h1').text().trim().replace(/<|>/g, '');
5155
if ( title === '' || title === null ) {
5256
console.log('ERROR: could not get title, skipping');
5357
return;
54-
} else if ( _.indexOf(titles,title) !== -1 ) {
58+
} else if ( _.include(titles,title) ) {
5559
console.log('WARNING: already scraped something with this title, skipping');
5660
return;
5761
}
@@ -65,15 +69,15 @@ requirejs([
6569
scrapeData['sectionHTMLs'] = [];
6670

6771
// get all section ids
68-
var ids = _.map($('[id^=section_]'), function(div) { return div.attribs.id } );
72+
var ids = _.map($('article[id]'), function(div) { return div.attribs.id } );
6973
if ( ids.length === 0 ) {
7074
console.log('WARNING: no sections...');
7175
return;
7276
}
7377

7478
for ( var i = 0; i < ids.length; i++ ) {
7579
// load the section html as its own jquery object
76-
var $section = cheerio.load($('[id^=' + ids[i] + ']').html());
80+
var $section = cheerio.load($('[id^="' + ids[i] + '"]').html());
7781

7882
// strip scripts
7983
$section('script').remove();
@@ -98,7 +102,7 @@ requirejs([
98102
});
99103

100104
// start 'er up
101-
spidey.get('https://developer.mozilla.org/en/HTML/Element').log('info');
105+
spidey.get('https://developer.mozilla.org/en-US/docs/tag/HTML').log('info');
102106

103107
process.on('exit', function () {
104108
fs.writeSync(file,JSON.stringify(results,null,'\t'));

scraper/js-mdn.js

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,14 @@ requirejs([
1515

1616
// use this to visit all links on a page
1717
var visitLinks = function($) {
18-
$('a').each(function() {
18+
$('a:not([class*=new])').each(function() {
1919
var href = $(this).attr('href');
20-
spidey.get(href);
20+
if (href && href.substr(0, 4) !== 'http') {
21+
href = 'https://developer.mozilla.org' + href;
22+
}
23+
if (href && href.indexOf('$') === -1 && href.indexOf('?') === -1 && /Web\/JavaScript\/Reference/.exec(href.split('=')[0]) !== null) {
24+
spidey.get(href);
25+
}
2126
});
2227
};
2328

@@ -27,32 +32,29 @@ requirejs([
2732
var file = fs.openSync(filename,'w');
2833

2934
// main index of mdn's js docs
30-
spidey.route('developer.mozilla.org', '/en/JavaScript/Reference', function ($) {
35+
spidey.route('developer.mozilla.org', '/en-US/docs/tag/JavaScript', function ($) {
3136
visitLinks($);
3237
});
3338

3439
var blacklist = [
35-
// 'https://developer.mozilla.org/en/JavaScript/Reference'
3640
];
3741

3842
// some urls redirect to other pages w/o changing the url (for example: https://developer.mozilla.org/en/CSS/-moz-scrollbars-none)
3943
// so in addition to not visiting the same url twice, keep this list to prevent visiting the same title twice
4044
var titles = [];
4145

42-
spidey.route('developer.mozilla.org', /(\/en\/JavaScript_typed_arrays|\/en\/JavaScript\/Reference\/(Global_Objects|Statement|Operators))\/*/, function ($, url) {
43-
if ( _.include(blacklist, url) ) return;
46+
spidey.route('developer.mozilla.org', /\/en\-US\/docs\/Web\/JavaScript\/Reference\/*/, function ($, url) {
47+
if ( _.include(blacklist,url) ) return;
4448
visitLinks($);
4549

4650
console.log('---------');
4751
console.log('scraping:',url);
4852

49-
var title = $('article .page-title h1').text().trim();
50-
if ( /Global_Objects/.test(url) && url.split('Global_Objects/').length > 1)
51-
title = url.split('Global_Objects/')[1].replace(/\//g, '.');
53+
var title = $('h1').text().trim();
5254
if ( title === '' || title === null ) {
5355
console.log('ERROR: could not get title, skipping');
5456
return;
55-
} else if ( _.include(titles, title) ) {
57+
} else if ( _.include(titles,title) ) {
5658
console.log('WARNING: already scraped something with this title, skipping');
5759
return;
5860
}
@@ -66,15 +68,15 @@ requirejs([
6668
scrapeData['sectionHTMLs'] = [];
6769

6870
// get all section ids
69-
var ids = _.map($('[id^=section_]'), function(div) { return div.attribs.id } );
71+
var ids = _.map($('article[id]'), function(div) { return div.attribs.id } );
7072
if ( ids.length === 0 ) {
7173
console.log('WARNING: no sections...');
7274
return;
7375
}
7476

7577
for ( var i = 0; i < ids.length; i++ ) {
7678
// load the section html as its own jquery object
77-
var $section = cheerio.load($('[id^=' + ids[i] + ']').html());
79+
var $section = cheerio.load($('[id^="' + ids[i] + '"]').html());
7880

7981
// strip scripts
8082
$section('script').remove();
@@ -99,7 +101,7 @@ requirejs([
99101
});
100102

101103
// start 'er up
102-
spidey.get('https://developer.mozilla.org/en/JavaScript/Reference').log('info');
104+
spidey.get('https://developer.mozilla.org/en-US/docs/tag/JavaScript').log('info');
103105

104106
process.on('exit', function () {
105107
fs.writeSync(file,JSON.stringify(results,null,'\t'));

scraper/webapi-mdn.js

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
var requirejs = require('requirejs');
2+
3+
requirejs([
4+
'spider',
5+
'underscore',
6+
'cheerio',
7+
'../models/sectionscrape',
8+
'path',
9+
'fs'
10+
], function(spider, _, cheerio, SectionScrape, path, fs) {
11+
12+
var results = [];
13+
14+
var spidey = spider();
15+
16+
// use this to visit all links on a page
17+
var visitLinks = function($) {
18+
$('a:not([class*=new])').each(function() {
19+
var href = $(this).attr('href');
20+
if (href && href.substr(0, 4) !== 'http') {
21+
href = 'https://developer.mozilla.org' + href;
22+
}
23+
if (href && href.indexOf('$') === -1 && href.indexOf('?') === -1 && /Web.?API/.exec(href.split('=')[0]) !== null) {
24+
spidey.get(href);
25+
}
26+
});
27+
};
28+
29+
// file where we'll dump the json
30+
var filename = path.dirname(__filename) + '/../static/data/webapi-mdn.json';
31+
console.log('dumping to ' + filename);
32+
var file = fs.openSync(filename,'w');
33+
34+
// main index of mdn's Web API docs
35+
spidey.route('developer.mozilla.org', '/en-US/docs/tag/WebAPI', function ($) {
36+
visitLinks($);
37+
});
38+
39+
var blacklist = [
40+
];
41+
42+
// some urls redirect to other pages w/o changing the url (for example: https://developer.mozilla.org/en/CSS/-moz-scrollbars-none)
43+
// so in addition to not visiting the same url twice, keep this list to prevent visiting the same title twice
44+
var titles = [];
45+
46+
spidey.route('developer.mozilla.org', /\/en\-US\/docs\/Web.?API\/*/, function ($, url) {
47+
if ( _.include(blacklist,url) ) return;
48+
visitLinks($);
49+
50+
console.log('---------');
51+
console.log('scraping:',url);
52+
53+
var title = $('h1').text().trim();
54+
if ( title === '' || title === null ) {
55+
console.log('ERROR: could not get title, skipping');
56+
return;
57+
} else if ( _.include(titles,title) ) {
58+
console.log('WARNING: already scraped something with this title, skipping');
59+
return;
60+
}
61+
62+
console.log('title:',title);
63+
64+
var scrapeData = new SectionScrape();
65+
scrapeData['title'] = title;
66+
scrapeData['url'] = url;
67+
scrapeData['sectionNames'] = [];
68+
scrapeData['sectionHTMLs'] = [];
69+
70+
// get all section ids
71+
var ids = _.map($('article[id]'), function(div) { return div.attribs.id } );
72+
if ( ids.length === 0 ) {
73+
console.log('WARNING: no sections...');
74+
return;
75+
}
76+
77+
for ( var i = 0; i < ids.length; i++ ) {
78+
// load the section html as its own jquery object
79+
var $section = cheerio.load($('[id^="' + ids[i] + '"]').html());
80+
81+
// strip scripts
82+
$section('script').remove();
83+
var sectionName = "";
84+
85+
// TODO find relative hrefs and turn them into absolute hrefs
86+
87+
// find the title of the section--mdn isn't very consistent with what size headers they use
88+
_.each([1,2,3,4],function(h) {
89+
var headers = $section('h' + h);
90+
if ( sectionName === "" && headers.length > 0 ) {
91+
sectionName = headers.text();
92+
}
93+
});
94+
95+
scrapeData['sectionNames'].push(sectionName);
96+
scrapeData['sectionHTMLs'].push($section.html());
97+
}
98+
99+
results.push(scrapeData.toJSON());
100+
titles.push(title);
101+
});
102+
103+
// start 'er up
104+
spidey.get('https://developer.mozilla.org/en-US/docs/tag/WebAPI').log('info');
105+
106+
process.on('exit', function () {
107+
fs.writeSync(file,JSON.stringify(results,null,'\t'));
108+
console.log('DONE');
109+
});
110+
return;
111+
});

static/dochub.appcache

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,4 @@ data/php-ext.json
7373
data/python.json
7474
data/python3.json
7575
data/xslt-w3.json
76+
data/webapi-mdn.json

0 commit comments

Comments
 (0)