Skip to content

Commit 0665e47

Browse files
committed
Implement recursive option
1 parent f9beaf0 commit 0665e47

File tree

6 files changed

+105
-0
lines changed

6 files changed

+105
-0
lines changed

lib/config/recursive-sources.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
// Source descriptors that lib/scraper.js merges into options.sources when the
// `recursive` option is set. The single entry targets <a> elements and their
// href attribute, so pages linked through anchors are treated as sources too.
module.exports = [
2+
{ selector: 'a', attr: 'href' }
3+
];

lib/scraper.js

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ var _ = require('underscore');
1010

1111
var defaults = require('./config/defaults');
1212
var types = require('./config/resource-types');
13+
var resursiveSources = require('./config/recursive-sources');
1314
var utils = require('./utils.js');
1415
var request = require('./request');
1516
var Resource = require('./resource');
@@ -141,6 +142,10 @@ Scraper.prototype.prepare = function prepare () {
141142
return new Resource(url, filename);
142143
});
143144

145+
// When the recursive option is set, add the anchor source(s) from
// ./config/recursive-sources to whatever sources the caller configured.
// NOTE(review): `resursiveSources` is a misspelling of `recursiveSources`;
// rename it together with its require() declaration at the top of this file.
// NOTE(review): _.union compares elements by identity, so a caller-supplied
// { selector: 'a', attr: 'href' } object would not be de-duplicated here.
if (self.options.recursive) {
146+
self.options.sources = _.union(self.options.sources, resursiveSources);
147+
}
148+
144149
return ensureDirAsync(self.options.directory);
145150
};
146151

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Title</title>
6+
</head>
7+
<body>
8+
<a href="/link1.html"></a>
9+
<a href="/link2.html"></a>
10+
<a href="/link3.html"></a>
11+
12+
</body>
13+
</html>
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Title</title>
6+
</head>
7+
<body>
8+
<a href="/about.html"></a>
9+
10+
</body>
11+
</html>

test/functional/recursive-test.js

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
require('should');
2+
var nock = require('nock');
3+
var fs = require('fs-extra');
4+
var path = require('path');
5+
var _ = require('underscore');
6+
var scraper = require('../../index');
7+
8+
var testDirname = __dirname + '/.recursive';
9+
var mockDirname = __dirname + '/mocks/recursive';
10+
11+
// Functional test for the `recursive` option: starting from a single URL, the
// scraper is expected to also save the pages reachable through anchors.
// All HTTP traffic is served by nock mocks; nothing touches the real network.
describe('Functional recursive downloading', function() {
12+
13+
// Start every test without leftover interceptors and with real network access blocked.
beforeEach(function() {
14+
nock.cleanAll();
15+
nock.disableNetConnect();
16+
});
17+
18+
// Undo the nock setup and delete the directory the test wrote into.
afterEach(function() {
19+
nock.cleanAll();
20+
nock.enableNetConnect();
21+
fs.removeSync(testDirname);
22+
});
23+
24+
it('should follow anchors', function(done) {
25+
// `sources` is empty on purpose: anything saved besides the start page
// must come from the sources added by `recursive: true`.
// NOTE(review): `subdirectories: null` presumably makes every file land
// directly in `directory`, which the flat paths asserted below rely on - confirm.
var options = {
26+
urls: [ 'http://example.com/' ],
27+
directory: testDirname,
28+
subdirectories: null,
29+
sources: [],
30+
recursive: true
31+
};
32+
33+
// start page
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
34+
35+
// mocks for the pages reached by following anchors
36+
nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html');
37+
nock('http://example.com/').get('/link1.html').reply(200, 'content 1');
38+
nock('http://example.com/').get('/link2.html').reply(200, 'content 2');
39+
nock('http://example.com/').get('/link3.html').reply(200, 'content 3');
40+
41+
scraper.scrape(options).then(function() {
42+
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
43+
44+
// anchors found in index.html were followed and saved
45+
fs.existsSync(testDirname + '/about.html').should.be.eql(true);
46+
47+
// anchors found in about.html were followed and saved as well
48+
fs.existsSync(testDirname + '/link1.html').should.be.eql(true);
49+
fs.existsSync(testDirname + '/link2.html').should.be.eql(true);
50+
fs.existsSync(testDirname + '/link3.html').should.be.eql(true);
51+
52+
done();
53+
// a failed assertion or a rejected scrape is handed to the test runner as done(err)
}).catch(done);
54+
});
55+
});

test/unit/scraper-test.js

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,24 @@ describe('Scraper', function () {
141141
done();
142142
}).catch(done);
143143
});
144+
145+
// Verifies that prepare() keeps the caller-supplied sources and appends the
// anchor source ({ selector: 'a', attr: 'href' }) when `recursive` is true.
it('should extend sources if recursive flag is set', function(done) {
146+
var s = new Scraper({
147+
urls: { url: 'http://first-url.com' },
148+
directory: testDirname,
149+
sources: [
150+
{ selector: 'img', attr: 'src' }
151+
],
152+
recursive: true
153+
});
154+
155+
s.prepare().then(function() {
156+
// two entries expected: the img source passed above plus the anchor source
s.options.sources.should.have.length(2);
157+
s.options.sources.should.containEql({ selector: 'img', attr: 'src' });
158+
s.options.sources.should.containEql({ selector: 'a', attr: 'href' });
159+
done();
160+
// a failed assertion or a rejected prepare() is handed to the test runner as done(err)
}).catch(done);
161+
});
144162
});
145163

146164
describe('#load', function() {

0 commit comments

Comments
 (0)