Skip to content
This repository was archived by the owner on Jul 23, 2025. It is now read-only.

Commit a36a2e2

Browse files
committed
Scrape Web API from MDN
1 parent eda6f8d commit a36a2e2

File tree

5 files changed

+170
-10
lines changed

5 files changed

+170
-10
lines changed

scraper/webapi-mdn.js

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
var requirejs = require('requirejs');
2+
3+
requirejs([
    'spider',
    'underscore',
    'cheerio',
    '../models/sectionscrape',
    'path',
    'fs'
], function(spider, _, cheerio, SectionScrape, path, fs) {
    // Crawls MDN's Web API documentation starting from the WebAPI tag index,
    // collects one SectionScrape per documentation page (title, url, and the
    // name + HTML of every <article id="..."> section), and dumps the
    // collected records as JSON to static/data/webapi-mdn.json on exit.

    // origin prepended to root-relative links found on MDN pages
    var MDN_ORIGIN = 'https://developer.mozilla.org';

    // one SectionScrape JSON record per scraped page; written out on exit
    var results = [];

    var spidey = spider();

    // Turn an href found on a page into an absolute URL.
    // Returns null for links that cannot be crawled from here: missing
    // hrefs, fragment-only links, path-relative links (the page url is not
    // known at this point) and non-http schemes such as mailto:.
    var toAbsoluteUrl = function(href) {
        if (!href) {
            return null;
        }
        if (/^https?:\/\//i.test(href)) {
            return href;
        }
        if (href.indexOf('//') === 0) {
            // protocol-relative link
            return 'https:' + href;
        }
        if (href.charAt(0) === '/') {
            return MDN_ORIGIN + href;
        }
        return null;
    };

    // use this to visit all links on a page
    // ($ is the cheerio-style selector function for the fetched page)
    var visitLinks = function($) {
        $('a:not([class*=new])').each(function() {
            var href = toAbsoluteUrl($(this).attr('href'));
            if (href === null) {
                return;
            }
            // skip MDN action links ($edit, $history, ...) and anything with a query string
            if (href.indexOf('$') !== -1 || href.indexOf('?') !== -1) {
                return;
            }
            // only follow links that look like Web API documentation
            if (/Web.?API/.test(href.split('=')[0])) {
                spidey.get(href);
            }
        });
    };

    // Return the text of the first header found in a section, preferring
    // h1 over h2 over h3 over h4, or '' when the section has no header text.
    // mdn isn't very consistent with what size headers they use
    var findSectionName = function($section) {
        for ( var level = 1; level <= 4; level++ ) {
            var headers = $section('h' + level);
            if ( headers.length > 0 ) {
                // only the first header names the section; taking the text of
                // the whole selection would glue every same-level header together
                var name = headers.first().text();
                if ( name !== '' ) {
                    return name;
                }
            }
        }
        return '';
    };

    // file where we'll dump the json
    var filename = path.dirname(__filename) + '/../static/data/webapi-mdn.json';
    console.log('dumping to ' + filename);
    var file = fs.openSync(filename,'w');

    // main index of mdn's Web API docs
    spidey.route('developer.mozilla.org', '/en-US/docs/tag/WebAPI', function ($) {
        visitLinks($);
    });

    // urls that must never be scraped
    var blacklist = [
    ];

    // some urls redirect to other pages w/o changing the url (for example: https://developer.mozilla.org/en/CSS/-moz-scrollbars-none)
    // so in addition to not visiting the same url twice, keep this list to prevent visiting the same title twice
    var titles = [];

    // every Web API documentation page
    spidey.route('developer.mozilla.org', /\/en\-US\/docs\/Web.?API\/*/, function ($, url) {
        if ( _.include(blacklist,url) ) return;
        visitLinks($);

        console.log('---------');
        console.log('scraping:',url);

        // .text() always yields a string, so an empty string is the only "missing" case
        var title = $('h1').text().trim();
        if ( title === '' ) {
            console.log('ERROR: could not get title, skipping');
            return;
        } else if ( _.include(titles,title) ) {
            console.log('WARNING: already scraped something with this title, skipping');
            return;
        }

        console.log('title:',title);

        var scrapeData = new SectionScrape();
        scrapeData['title'] = title;
        scrapeData['url'] = url;
        scrapeData['sectionNames'] = [];
        scrapeData['sectionHTMLs'] = [];

        // every <article> carrying an id is one section of the page
        var sections = $('article[id]');
        if ( sections.length === 0 ) {
            console.log('WARNING: no sections...');
            return;
        }

        sections.each(function() {
            // load the section html as its own jquery object; the element is
            // used directly instead of being looked up again by id, so a
            // section can never be confused with another element whose id
            // merely starts with the same text
            var $section = cheerio.load($(this).html() || '');

            // strip scripts
            $section('script').remove();

            // TODO find relative hrefs and turn them into absolute hrefs

            scrapeData['sectionNames'].push(findSectionName($section));
            scrapeData['sectionHTMLs'].push($section.html());
        });

        results.push(scrapeData.toJSON());
        titles.push(title);
    });

    // start 'er up
    spidey.get('https://developer.mozilla.org/en-US/docs/tag/WebAPI').log('info');

    // only synchronous work is possible in an 'exit' handler, hence the *Sync calls
    process.on('exit', function () {
        fs.writeSync(file,JSON.stringify(results,null,'\t'));
        fs.closeSync(file);
        console.log('DONE');
    });
});

static/dochub.appcache

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,4 @@ data/php-ext.json
7373
data/python.json
7474
data/python3.json
7575
data/xslt-w3.json
76+
data/webapi-mdn.json
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
define([
    'jQuery',
    'Underscore',
    'Backbone',
    'models/sectionscrape'
], function($, _, Backbone, SectionScrape) {

    // Splits a page title into a leading run of non-word characters (the
    // "prefix", e.g. the "<" of "<xyz>" or the "::" of "::xyz") and the rest
    // of the title. Both groups may be empty and [\s\S] also matches line
    // breaks, so the pattern matches every string and exec() never returns null.
    var apiPropsPattern = new RegExp("^(\\W*)([\\s\\S]*)$");

    // Collection of the Web API pages scraped from MDN into data/webapi-mdn.json.
    var MozDevAPIProps = Backbone.Collection.extend({
        url: 'data/webapi-mdn.json',
        model: SectionScrape,

        // Returns the sort key of a model; the leading digit orders the groups:
        //   '0' + name  - titles starting with a character that has no
        //                 upper-case form difference (lower-case letter, digit, "_")
        //   '1' + name  - titles starting with an upper-case letter
        //   '2' + title - titles starting with a non-word character, and
        //                 missing/empty titles
        comparator: function(model) {
            // a missing title is treated like an empty one instead of throwing
            var title = String(model.get('title') || '');
            var results = apiPropsPattern.exec(title);
            var prefix = results[1];
            var name = results[2];

            if (prefix || name === '') {
                return '2' + title;
            }

            var initial = name.charAt(0);
            if (initial === initial.toLowerCase()) {
                return '0' + name;
            }
            return '1' + name;
        }
    });

    return MozDevAPIProps;
});

static/js/router.js

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ define([
55
'jQuery',
66
'Underscore',
77
'Backbone',
8-
8+
99
// Settings
1010
'settings',
1111

@@ -20,6 +20,7 @@ define([
2020
'collections/mozdevcssprops',
2121
'collections/mdnhtmlelements',
2222
'collections/mdnjsobjs',
23+
'collections/mozdevapiprops',
2324
'collections/mdndomobjs',
2425
'collections/phpexts',
2526
'collections/jqentries',
@@ -31,13 +32,13 @@ define([
3132
// Templates
3233
'text!templates/mdnpage.html',
3334
], function(doc, $, _, Backbone,
34-
Settings,
35+
Settings,
3536
TopNavView, JQuerySearchResultsView, LanguageView, PageScrapedLanguageView,
3637
FullWindowView,
37-
MozDevCSSPropCollection, MDNHtmlElementsCollection, MDNJsObjsCollection,
38+
MozDevCSSPropCollection, MDNHtmlElementsCollection, MDNJsObjsCollection, MozDevAPIPropCollection,
3839
MDNDomObjsCollection, PHPExtensionsCollection, JQEntriesCollection,
39-
XSLTPagesCollection, PythonPagesCollection,
40-
Python3PagesCollection, NodejsPagesCollection,
40+
XSLTPagesCollection, PythonPagesCollection,
41+
Python3PagesCollection, NodejsPagesCollection,
4142
MDNPage) {
4243

4344
var DocHub = Backbone.Router.extend({
@@ -116,7 +117,14 @@ define([
116117
debounceTime: 200,
117118
minQueryLength: 3,
118119
}),
119-
120+
'webapi' : new LanguageView({
121+
languageName: 'WebAPI',
122+
resultsClassNames: 'webapi',
123+
collection: new MozDevAPIPropCollection(),
124+
placeholder: 'Type a Web API name',
125+
mainResultTemplate: MDNPage,
126+
}),
127+
120128
// 'xslt' : new PageScrapedLanguageView({
121129
// languageName: 'XSLT',
122130
// resultsClassNames: 'w3',
@@ -133,7 +141,7 @@ define([
133141
this.renderTopNav = _.once(function() {
134142
self.topNavView = new TopNavView({
135143
el: $('.navbar'),
136-
settings: Settings
144+
settings: Settings
137145
});
138146
self.topNavView.render();
139147
self.topNavView.bind('changeLanguage', self.changeLanguage);

static/js/settings.js

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,18 @@ define([], function(){
99
{ lang: 'php', name: 'PHP' },
1010
{ lang: 'python', name: 'Python' },
1111
{ lang: 'python3', name: 'Python3' },
12-
{ lang: 'nodejs', name: 'Node.js' }
12+
{ lang: 'nodejs', name: 'Node.js' },
13+
{ lang: 'webapi', name: 'WebAPI' }
1314
]
1415
}
15-
16+
1617
Settings.languages.sort(function(a, b){
    // Order the languages alphabetically (ascending) by display name, ignoring case.
    var left = a.name.toLowerCase();
    var right = b.name.toLowerCase();

    if (left === right) {
        return 0; // same name: no reordering needed
    }
    return left < right ? -1 : 1;
});
23-
24+
2425
return Settings;
2526
});

0 commit comments

Comments
 (0)