Skip to content

Commit bef139f

Browse files
committed
Merge pull request #16 from s0ph1e/recursive-max-depth-options
Add recursive and maxDepth options
2 parents e9d2aa7 + abc5f52 commit bef139f

File tree

12 files changed

+396
-39
lines changed

12 files changed

+396
-39
lines changed

README.md

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ npm install website-scraper
1919

2020
## Usage
2121
```javascript
22-
var scraper = require('website-scraper');
22+
var scraper = require('website-scraper');
2323
var options = {
2424
urls: ['http://nodejs.org/'],
2525
directory: '/path/to/save/',
@@ -38,7 +38,7 @@ scraper.scrape(options).then(function (result) {
3838

3939
## API
4040
### scrape(options, callback)
41-
Makes requests to `urls` and saves all files found with `sources` to `directory`.
41+
Makes requests to `urls` and saves all files found with `sources` to `directory`.
4242

4343
**options** - object containing the following options:
4444

@@ -48,31 +48,34 @@ Makes requests to `urls` and saves all files found with `sources` to `directory`
4848
- `sources:` array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see default value in `lib/config/defaults.js`)*
4949
- `subdirectories:` array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
5050
- `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)*
51-
52-
51+
- `recursive`: boolean, if `true`, the scraper will follow anchors in HTML files. Don't forget to set `maxDepth` to avoid infinite downloading *(optional, see example below)*
52+
- `maxDepth`: positive number, maximum allowed depth for dependencies *(optional, see example below)*
53+
54+
5355
**callback** - callback function *(optional)*, includes the following parameters:
54-
56+
5557
- `error:` if error - `Error` object, if success - `null`
5658
- `result:` if error - `null`, if success - array of objects containing:
5759
- `url:` url of loaded page
5860
- `filename:` filename where page was saved (relative to `directory`)
5961

6062

6163
## Examples
62-
Let's scrape some pages from [http://nodejs.org/](http://nodejs.org/) with images, css, js files and save them to `/path/to/save/`.
64+
#### Example 1
65+
Let's scrape some pages from [http://nodejs.org/](http://nodejs.org/) with images, css, js files and save them to `/path/to/save/`.
6366
Imagine we want to load:
6467
- [Home page](http://nodejs.org/) to `index.html`
6568
- [About page](http://nodejs.org/about/) to `about.html`
6669
- [Blog](http://blog.nodejs.org/) to `blog.html`
67-
70+
6871
and separate files into directories:
6972

70-
- `img` for .jpg, .png, .svg (full path `/path/to/save/img`)
73+
- `img` for .jpg, .png, .svg (full path `/path/to/save/img`)
7174
- `js` for .js (full path `/path/to/save/js`)
7275
- `css` for .css (full path `/path/to/save/css`)
7376

7477
```javascript
75-
var scraper = require('website-scraper');
78+
var scraper = require('website-scraper');
7679
scraper.scrape({
7780
urls: [
7881
'http://nodejs.org/', // Will be saved with default filename 'index.html'
@@ -101,3 +104,16 @@ scraper.scrape({
101104
console.log(err);
102105
});
103106
```
107+
108+
#### Example 2. Recursive downloading
109+
```javascript
110+
// Links from example.com will be followed
111+
// Links from links will be ignored because their depth = 2 is greater than maxDepth
112+
var scraper = require('website-scraper');
113+
scraper.scrape({
114+
urls: ['http://example.com/'],
115+
directory: '/path/to/save',
116+
recursive: true,
117+
maxDepth: 1
118+
}).then(console.log).catch(console.log);
119+
```

lib/config/recursive-sources.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
module.exports = [
2+
{ selector: 'a', attr: 'href' }
3+
];

lib/file-handlers/css.js

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
var _ = require('underscore');
22
var Promise = require('bluebird');
33
var getCssUrls = require('css-url-parser');
4-
var Resource = require('../resource');
54
var utils = require('../utils');
65

76
function loadCss (context, resource) {
@@ -12,8 +11,7 @@ function loadCss (context, resource) {
1211

1312
var promises = _.map(cssUrls, function loadResourceFromCssUrl (cssUrl) {
1413
var resourceUrl = utils.getUrl(url, cssUrl);
15-
var cssResource = new Resource(resourceUrl);
16-
cssResource.setParent(resource);
14+
var cssResource = resource.createChild(resourceUrl);
1715

1816
return context.loadResource(cssResource).then(function handleLoadedSource (loadedResource) {
1917
var relativePath = utils.getRelativePath(filename, loadedResource.getFilename());

lib/file-handlers/html.js

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
var cheerio = require('cheerio');
22
var Promise = require('bluebird');
33
var utils = require('../utils');
4-
var Resource = require('../resource');
54

65
function loadHtml (context, resource) {
76
var sources = context.getHtmlSources();
@@ -50,8 +49,7 @@ function loadResources (context, resource, source) {
5049

5150
if (attr) {
5251
var resourceUrl = utils.getUrl(url, attr);
53-
var htmlResource = new Resource(resourceUrl);
54-
htmlResource.setParent(resource);
52+
var htmlResource = resource.createChild(resourceUrl);
5553
htmlResource.setHtmlData({ tagName: el[0].name, attributeName: source.attr });
5654

5755
return context.loadResource(htmlResource).then(function handleLoadedSource (loadedResource) {

lib/resource.js

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,17 @@ function Resource (url, filename) {
1515
this.filename = filename;
1616
}
1717

18+
Resource.prototype.createChild = function createChild (url, filename) {
19+
var child = new Resource(url, filename);
20+
21+
var currentDepth = this.getDepth();
22+
23+
child.setParent(this);
24+
child.setDepth(++currentDepth);
25+
26+
return child;
27+
};
28+
1829
Resource.prototype.getUrl = function getUrl () {
1930
return this.url;
2031
};
@@ -43,6 +54,14 @@ Resource.prototype.setParent = function setParent (parent) {
4354
this.parent = parent;
4455
};
4556

57+
Resource.prototype.getDepth = function getDepth () {
58+
return this.depth || 0;
59+
};
60+
61+
Resource.prototype.setDepth = function setDepth (depth) {
62+
this.depth = depth;
63+
};
64+
4665
/**
4766
*
4867
* @param {Object} data - html element data

lib/scraper.js

Lines changed: 41 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,27 @@
11
var Promise = require('bluebird');
2-
var fs = Promise.promisifyAll(require('fs-extra'));
2+
3+
var fs = require('fs-extra');
4+
var existsAsync = Promise.promisify(fs.stat);
5+
var outputFileAsync = Promise.promisify(fs.outputFile);
6+
var ensureDirAsync = Promise.promisify(fs.ensureDir);
7+
38
var path = require('path');
49
var _ = require('underscore');
510

611
var defaults = require('./config/defaults');
712
var types = require('./config/resource-types');
13+
var recursiveSources = require('./config/recursive-sources');
814
var utils = require('./utils.js');
915
var request = require('./request');
1016
var Resource = require('./resource');
11-
var loadHtml = require('./file-handlers/html');
12-
var loadCss = require('./file-handlers/css');
1317
var compareUrls = require('compare-urls');
1418

15-
function getHandleFunction (resource) {
16-
var type = resource.getType();
17-
switch (type) {
18-
case types.css: return loadCss;
19-
case types.html: return function loadHtmlAndCss (context, po) {
20-
return loadHtml(context, po).then(function (loaded) {
21-
return loadCss(context, loaded);
22-
});
23-
};
24-
default: return _.noop;
25-
}
19+
var loadHtml = require('./file-handlers/html');
20+
var loadCss = require('./file-handlers/css');
21+
function loadHtmlAndCss (context, po) {
22+
return loadHtml(context, po).then(function (loaded) {
23+
return loadCss(context, loaded);
24+
});
2625
}
2726

2827
function Scraper (options) {
@@ -83,6 +82,20 @@ Scraper.prototype.getDirectoryByExtension = function getDirectoryByExtension (ex
8382
.value() || '';
8483
};
8584

85+
Scraper.prototype.getResourceHandler = function getHandler (resource) {
86+
var self = this;
87+
var type = resource.getType();
88+
var depth = resource.getDepth();
89+
var depthGreaterThanMax = self.options.maxDepth && depth >= self.options.maxDepth;
90+
91+
switch (true) {
92+
case depthGreaterThanMax: return _.noop;
93+
case type == types.css: return loadCss;
94+
case type == types.html: return loadHtmlAndCss;
95+
default: return _.noop;
96+
}
97+
};
98+
8699
Scraper.prototype.loadResource = function loadResource (resource) {
87100
var self = this;
88101

@@ -102,12 +115,12 @@ Scraper.prototype.loadResource = function loadResource (resource) {
102115
return self.makeRequest(url).then(function requestCompleted(data) {
103116
resource.setUrl(data.url); // Url may be changed in redirects
104117
resource.setText(data.body);
105-
handleFile = getHandleFunction(resource);
118+
handleFile = self.getResourceHandler(resource);
106119
return handleFile(self, resource);
107120
}).then(function fileHandled() {
108121
var filename = path.join(self.options.directory, resource.getFilename());
109122
var text = resource.getText();
110-
return fs.outputFileAsync(filename, text, { encoding: 'binary' });
123+
return outputFileAsync(filename, text, { encoding: 'binary' });
111124
}).then(function fileSaved() {
112125
return Promise.resolve(resource);
113126
});
@@ -116,15 +129,16 @@ Scraper.prototype.loadResource = function loadResource (resource) {
116129
};
117130

118131
Scraper.prototype.validate = function validate () {
119-
if (fs.existsSync(this.options.directory)) {
120-
return Promise.reject(new Error('Path ' + this.options.directory + ' exists'));
121-
}
122-
return Promise.resolve();
132+
var dir = this.options.directory;
133+
return existsAsync(dir).then(function handleDirectoryExist () {
134+
return Promise.reject(new Error('Path ' + dir + ' exists'));
135+
}, function handleDirectoryNotExist () {
136+
return Promise.resolve();
137+
});
123138
};
124139

125140
Scraper.prototype.prepare = function prepare () {
126141
var self = this;
127-
fs.ensureDirSync(self.options.directory);
128142

129143
// Create makeRequest function with custom request params
130144
self.makeRequest = request.makeRequest.bind(null, self.options.request);
@@ -136,7 +150,12 @@ Scraper.prototype.prepare = function prepare () {
136150
var filename = _.isObject(obj) && _.has(obj, 'filename') ? obj.filename : self.options.defaultFilename;
137151
return new Resource(url, filename);
138152
});
139-
return Promise.resolve();
153+
154+
if (self.options.recursive) {
155+
self.options.sources = _.union(self.options.sources, recursiveSources);
156+
}
157+
158+
return ensureDirAsync(self.options.directory);
140159
};
141160

142161
Scraper.prototype.load = function load () {

package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,9 @@
4444
"istanbul": "^0.4.0",
4545
"mocha": "^2.2.5",
4646
"nock": "^2.9.1",
47+
"proxyquire": "^1.7.3",
4748
"should": "^7.0.2",
48-
"sinon": "^1.15.4"
49+
"sinon": "^1.15.4",
50+
"sinon-as-promised": "^4.0.0"
4951
}
5052
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Title</title>
6+
</head>
7+
<body>
8+
<a href="/link1.html"></a>
9+
<a href="/link2.html"></a>
10+
<a href="/link3.html"></a>
11+
12+
</body>
13+
</html>
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Title</title>
6+
</head>
7+
<body>
8+
<a href="/about.html"></a>
9+
10+
</body>
11+
</html>

0 commit comments

Comments
 (0)