Skip to content

Commit 13b4e52

Browse files
authored
Add maxRecursiveDepth option (#205)
1 parent 978a311 commit 13b4e52

File tree

10 files changed

+271
-87
lines changed

10 files changed

+271
-87
lines changed

README.md

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,9 @@ scrape(options, (error, result) => {
4848
* [urls](#urls) - urls to download, *required*
4949
* [directory](#directory) - path to save files, *required*
5050
* [sources](#sources) - selects which resources should be downloaded
51-
* [recursive](#recursive) - follow anchors in html files
52-
* [maxDepth](#maxdepth) - maximum depth for dependencies
51+
* [recursive](#recursive) - follow hyperlinks in html files
52+
* [maxRecursiveDepth](#maxrecursivedepth) - maximum depth for hyperlinks
53+
* [maxDepth](#maxdepth) - maximum depth for all dependencies
5354
* [request](#request) - custom options for [request](https://github.com/request/request)
5455
* [subdirectories](#subdirectories) - subdirectories for file extensions
5556
* [defaultFilename](#defaultfilename) - filename for index page
@@ -96,10 +97,13 @@ scrape({
9697
```
9798

9899
#### recursive
99-
Boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading. Defaults to `false`.
100+
Boolean, if `true` scraper will follow hyperlinks in html files. Don't forget to set `maxRecursiveDepth` to avoid infinite downloading. Defaults to `false`.
101+
102+
#### maxRecursiveDepth
103+
Positive number, maximum allowed depth for hyperlinks. Other dependencies will be saved regardless of their depth. Defaults to `null` - no maximum recursive depth set.
100104

101105
#### maxDepth
102-
Positive number, maximum allowed depth for dependencies. Defaults to `null` - no maximum depth set.
106+
Positive number, maximum allowed depth for all dependencies. Defaults to `null` - no maximum depth set.
103107

104108
#### request
105109
Object, custom options for [request](https://github.com/request/request#requestoptions-callback). Allows setting cookies, userAgent, etc.

lib/config/defaults.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ const config = {
4848
},
4949
urlFilter: null,
5050
recursive: false,
51+
maxRecursiveDepth: null,
5152
maxDepth: null,
5253
ignoreErrors: true,
5354
httpResponseHandler: null,
Lines changed: 58 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,70 @@
1-
var ImgSrcsetTag = require('../path-containers/html-img-srcset-tag');
2-
var CommonTag = require('../path-containers/html-common-tag');
3-
var CssText = require('../path-containers/css-text');
4-
var _ = require('lodash');
5-
var utils = require('../../utils');
1+
'use strict';
62

7-
var pathContainersByRule = [
3+
const ImgSrcsetTag = require('../path-containers/html-img-srcset-tag');
4+
const CommonTag = require('../path-containers/html-common-tag');
5+
const CssText = require('../path-containers/css-text');
6+
const utils = require('../../utils');
7+
8+
const pathContainersByRule = [
89
{ selector: '[style]', attr: 'style', containerClass: CssText },
910
{ selector: 'style', containerClass: CssText },
1011
{ selector: '*[srcset]', attr: 'srcset', containerClass: ImgSrcsetTag }
1112
];
1213

1314
/**
1415
* Represents a pair of a cheerio element and the rule used to find text containing child resources
15-
* @param {Object} el - cheerio obj for dom element
16-
* @param {Object} rule - rule used to find current element
17-
* @param {string} rule.selector - cheerio selector
18-
* @param {string} rule.attr - attribute to find text which contains resources. if not set - use inner html
19-
* @constructor
2016
*/
21-
function HtmlSourceElement (el, rule) {
22-
this.el = el;
23-
this.rule = rule;
24-
}
17+
class HtmlSourceElement {
18+
/**
19+
* @param {Object} el - cheerio obj for dom element
20+
* @param {Object} rule - rule used to find current element
21+
* @param {string} rule.selector - cheerio selector
22+
* @param {string} rule.attr - attribute to find text which contains resources. if not set - use inner html
23+
*/
24+
constructor (el, rule) {
25+
this.el = el;
26+
this.rule = rule;
27+
}
2528

26-
/**
27-
* Get text from attr or from innerHtml of element based on rule
28-
* @returns {string}
29-
*/
30-
HtmlSourceElement.prototype.getData = function getData () {
31-
var text = this.rule.attr ? this.el.attr(this.rule.attr) : this.el.text();
32-
return utils.decodeHtmlEntities(text);
33-
};
34-
35-
HtmlSourceElement.prototype.setData = function setData (newData) {
36-
this.rule.attr ? this.el.attr(this.rule.attr, newData) : this.el.text(newData);
37-
};
38-
39-
HtmlSourceElement.prototype.getPathContainerClass = function getPathContainerClass () {
40-
var selectedRule = _.find(pathContainersByRule, (containerByRule) => {
41-
return this.el.is(containerByRule.selector) && this.rule.attr === containerByRule.attr;
42-
});
43-
44-
return selectedRule ? selectedRule.containerClass : CommonTag;
45-
};
46-
47-
HtmlSourceElement.prototype.getPathContainer = function getPathContainer () {
48-
var ContainerClass = this.getPathContainerClass();
49-
var textWithResources = this.getData();
50-
return textWithResources ? new ContainerClass(textWithResources) : null;
51-
};
29+
/**
30+
* Get resource data from element using rule
31+
* @returns {string}
32+
*/
33+
getData () {
34+
const text = this.rule.attr ? this.el.attr(this.rule.attr) : this.el.text();
35+
return utils.decodeHtmlEntities(text);
36+
}
37+
38+
/**
39+
* Update attribute or inner text of el with new data
40+
* @param {string} newData
41+
*/
42+
setData (newData) {
43+
this.rule.attr ? this.el.attr(this.rule.attr, newData) : this.el.text(newData);
44+
}
45+
46+
/**
47+
* Returns PathContainer instance for element
48+
* @returns {CssText|HtmlCommonTag|HtmlImgSrcSetTag|null}
49+
*/
50+
getPathContainer () {
51+
const selectedRule = this.findMatchedRule(pathContainersByRule);
52+
const ContainerClass = selectedRule ? selectedRule.containerClass : CommonTag;
53+
const textWithResources = this.getData();
54+
return textWithResources ? new ContainerClass(textWithResources) : null;
55+
}
56+
57+
matchesRule (rule) {
58+
return this.el.is(rule.selector) && this.rule.attr === rule.attr;
59+
}
60+
61+
findMatchedRule (rulesArray) {
62+
return rulesArray.find(this.matchesRule, this);
63+
}
64+
65+
toString () {
66+
return JSON.stringify({selector: this.rule.selector, attr: this.rule.attr, data: this.getData()});
67+
}
68+
}
5269

5370
module.exports = HtmlSourceElement;

lib/resource-handler/html/index.js

Lines changed: 48 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,57 @@
1-
var cheerio = require('cheerio');
2-
var Promise = require('bluebird');
3-
var utils = require('../../utils');
4-
var HtmlSourceElement = require('./html-source-element');
5-
6-
function HtmlResourceHandler (options, handleChildrenPaths) {
7-
this.options = options;
8-
this.handleChildrenPaths = handleChildrenPaths;
9-
}
1+
'use strict';
102

11-
HtmlResourceHandler.prototype.handle = function handle (resource) {
12-
var $ = loadTextToCheerio(resource.getText());
13-
prepareToLoad($, resource);
14-
15-
return Promise.mapSeries(this.options.sources, this.loadResourcesForRule.bind(this, $, resource))
16-
.then(function updateResource () {
17-
resource.setText($.html());
18-
return resource;
19-
});
20-
};
21-
22-
HtmlResourceHandler.prototype.loadResourcesForRule = function loadResourcesForRule ($, parentResource, rule) {
23-
var self = this;
24-
var promises = $(rule.selector).map(function loadForElement () {
25-
var el = new HtmlSourceElement($(this), rule);
26-
var pathContainer = el.getPathContainer();
27-
if (!pathContainer) {
28-
return Promise.resolve();
29-
}
30-
return self.handleChildrenPaths(pathContainer, parentResource).then(el.setData.bind(el));
31-
}).get();
3+
const cheerio = require('cheerio');
4+
const Promise = require('bluebird');
5+
const utils = require('../../utils');
6+
const logger = require('../../logger');
7+
const HtmlSourceElement = require('./html-source-element');
8+
9+
class HtmlResourceHandler {
10+
constructor (options, handleChildrenPaths) {
11+
this.options = options;
12+
this.handleChildrenPaths = handleChildrenPaths;
13+
}
14+
15+
handle (resource) {
16+
const $ = loadTextToCheerio(resource.getText());
17+
prepareToLoad($, resource);
18+
19+
return Promise.mapSeries(this.options.sources, this.loadResourcesForRule.bind(this, $, resource))
20+
.then(function updateResource () {
21+
resource.setText($.html());
22+
return resource;
23+
});
24+
}
3225

33-
return utils.waitAllFulfilled(promises);
34-
};
26+
loadResourcesForRule ($, parentResource, rule) {
27+
const self = this;
28+
const promises = $(rule.selector).map(function loadForElement () {
29+
const el = new HtmlSourceElement($(this), rule);
30+
31+
const isRecursive = self.options.recursiveSources && Boolean(el.findMatchedRule(self.options.recursiveSources));
32+
const isDepthGreaterThanMax = self.options.maxRecursiveDepth && parentResource.getDepth() >= self.options.maxRecursiveDepth;
33+
if (isRecursive && isDepthGreaterThanMax) {
34+
logger.debug(`filtering out ${el} by max recursive depth`);
35+
return Promise.resolve();
36+
}
37+
38+
const pathContainer = el.getPathContainer();
39+
if (!pathContainer) {
40+
return Promise.resolve();
41+
}
42+
return self.handleChildrenPaths(pathContainer, parentResource).then(el.setData.bind(el));
43+
}).get();
44+
45+
return utils.waitAllFulfilled(promises);
46+
}
47+
}
3548

3649
function prepareToLoad ($, resource) {
3750
$('base').each(function handleBaseTag () {
38-
var el = $(this);
39-
var href = el.attr('href');
51+
const el = $(this);
52+
const href = el.attr('href');
4053
if (href) {
41-
var newUrl = utils.getUrl(resource.getUrl(), href);
54+
const newUrl = utils.getUrl(resource.getUrl(), href);
4255
resource.setUrl(newUrl);
4356
el.remove();
4457
}

lib/resource-handler/index.js

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,14 @@ var utils = require('../utils');
66
var HtmlHandler = require('./html');
77
var CssHandler = require('./css');
88

9-
var supportedOptions = ['prettifyUrls', 'sources', 'defaultFilename'];
9+
var supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename'];
1010

1111
function ResourceHandler (options, context) {
12-
var self = this;
13-
self.options = _.pick(options, supportedOptions);
14-
self.context = context;
12+
this.options = _.pick(options, supportedOptions);
13+
this.context = context;
1514

16-
self.htmlHandler = new HtmlHandler(self.options, self.handleChildrenResources.bind(self));
17-
self.cssHandler = new CssHandler(self.options, self.handleChildrenResources.bind(self));
15+
this.htmlHandler = new HtmlHandler(this.options, this.handleChildrenResources.bind(this));
16+
this.cssHandler = new CssHandler(this.options, this.handleChildrenResources.bind(this));
1817
}
1918

2019
ResourceHandler.prototype.getResourceHandler = function getResourceHandler (resource) {

lib/scraper.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,9 @@ function Scraper (options) {
2929
});
3030
}
3131

32+
self.options.recursiveSources = recursiveSources;
3233
if (self.options.recursive) {
33-
self.options.sources = _.union(self.options.sources, recursiveSources);
34+
self.options.sources = _.union(self.options.sources, self.options.recursiveSources);
3435
}
3536

3637
logger.info('init with options', self.options);
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
require('should');
2+
const nock = require('nock');
3+
const fs = require('fs-extra');
4+
const scrape = require('../../../index');
5+
6+
const testDirname = __dirname + '/.tmp';
7+
const mockDirname = __dirname + '/mocks';
8+
9+
describe('Functional: maxDepth and maxRecursiveDepth ', () => {
10+
11+
beforeEach(() => {
12+
nock.cleanAll();
13+
nock.disableNetConnect();
14+
});
15+
16+
afterEach(() => {
17+
nock.cleanAll();
18+
nock.enableNetConnect();
19+
fs.removeSync(testDirname);
20+
});
21+
22+
it('should filter out all resources by depth > maxDepth', () => {
23+
const options = {
24+
urls: [ 'http://example.com/' ],
25+
directory: testDirname,
26+
subdirectories: null,
27+
sources: [
28+
{ selector: 'img', attr: 'src' },
29+
{ selector: 'script', attr: 'src' },
30+
{ selector: 'a', attr: 'href' }
31+
],
32+
maxDepth: 2
33+
};
34+
35+
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
36+
37+
nock('http://example.com/').get('/depth1.html').replyWithFile(200, mockDirname + '/depth1.html');
38+
nock('http://example.com/').get('/img-depth1.jpg').reply(200, 'img-depth1.jpg');
39+
nock('http://example.com/').get('/script-depth1.js').reply(200, 'script-depth1.js');
40+
41+
nock('http://example.com/').get('/depth2.html').replyWithFile(200, mockDirname + '/depth2.html');
42+
nock('http://example.com/').get('/img-depth2.jpg').reply(200, 'img-depth2.jpg');
43+
nock('http://example.com/').get('/script-depth2.js').reply(200, 'script-depth2.js');
44+
45+
nock('http://example.com/').get('/depth3.html').reply(200, 'OK');
46+
nock('http://example.com/').get('/img-depth3.jpg').reply(200, 'img-depth3.jpg');
47+
nock('http://example.com/').get('/script-depth3.js').reply(200, 'script-depth3.js');
48+
49+
return scrape(options).then(() => {
50+
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
51+
52+
fs.existsSync(testDirname + '/depth1.html').should.be.eql(true);
53+
fs.existsSync(testDirname + '/img-depth1.jpg').should.be.eql(true);
54+
fs.existsSync(testDirname + '/script-depth1.js').should.be.eql(true);
55+
56+
fs.existsSync(testDirname + '/depth2.html').should.be.eql(true);
57+
fs.existsSync(testDirname + '/img-depth2.jpg').should.be.eql(true);
58+
fs.existsSync(testDirname + '/script-depth2.js').should.be.eql(true);
59+
60+
fs.existsSync(testDirname + '/depth3.html').should.be.eql(false);
61+
fs.existsSync(testDirname + '/img-depth3.jpg').should.be.eql(false);
62+
fs.existsSync(testDirname + '/script-depth3.js').should.be.eql(false);
63+
});
64+
});
65+
66+
67+
it('should filter out only anchors by depth > maxRecursiveDepth', () => {
68+
const options = {
69+
urls: [ 'http://example.com/' ],
70+
directory: testDirname,
71+
subdirectories: null,
72+
sources: [
73+
{ selector: 'img', attr: 'src' },
74+
{ selector: 'script', attr: 'src' },
75+
{ selector: 'a', attr: 'href' }
76+
],
77+
maxRecursiveDepth: 2
78+
};
79+
80+
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
81+
82+
nock('http://example.com/').get('/depth1.html').replyWithFile(200, mockDirname + '/depth1.html');
83+
nock('http://example.com/').get('/img-depth1.jpg').reply(200, 'img-depth1.jpg');
84+
nock('http://example.com/').get('/script-depth1.js').reply(200, 'script-depth1.js');
85+
86+
nock('http://example.com/').get('/depth2.html').replyWithFile(200, mockDirname + '/depth2.html');
87+
nock('http://example.com/').get('/img-depth2.jpg').reply(200, 'img-depth2.jpg');
88+
nock('http://example.com/').get('/script-depth2.js').reply(200, 'script-depth2.js');
89+
90+
nock('http://example.com/').get('/depth3.html').reply(200, 'OK');
91+
nock('http://example.com/').get('/img-depth3.jpg').reply(200, 'img-depth3.jpg');
92+
nock('http://example.com/').get('/script-depth3.js').reply(200, 'script-depth3.js');
93+
94+
return scrape(options).then(() => {
95+
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
96+
97+
fs.existsSync(testDirname + '/depth1.html').should.be.eql(true);
98+
fs.existsSync(testDirname + '/img-depth1.jpg').should.be.eql(true);
99+
fs.existsSync(testDirname + '/script-depth1.js').should.be.eql(true);
100+
101+
fs.existsSync(testDirname + '/depth2.html').should.be.eql(true);
102+
fs.existsSync(testDirname + '/img-depth2.jpg').should.be.eql(true);
103+
fs.existsSync(testDirname + '/script-depth2.js').should.be.eql(true);
104+
105+
fs.existsSync(testDirname + '/depth3.html').should.be.eql(false);
106+
// img-depth3.jpg and script-depth3.js - dependencies of depth2.html
107+
// they should be loaded because maxRecursiveDepth applies only to <a href=''>
108+
fs.existsSync(testDirname + '/img-depth3.jpg').should.be.eql(true);
109+
fs.existsSync(testDirname + '/script-depth3.js').should.be.eql(true);
110+
});
111+
});
112+
113+
});
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Title</title>
6+
<script src="script-depth2.js"></script>
7+
</head>
8+
<body>
9+
<a href="/depth2.html"></a>
10+
<img src="img-depth2.jpg"/>
11+
</body>
12+
</html>

0 commit comments

Comments
 (0)