Skip to content

Commit 52fc13e

Browse files
authored
Add custom httpResponseHandler (#188)
* Add custom httpResponseHandler * Add tests * Add httpResponseHandler to default options * Reorder options description in readme * Add info about httpResponseHandler to readme
1 parent c2fb99d commit 52fc13e

File tree

8 files changed

+336
-141
lines changed

8 files changed

+336
-141
lines changed

README.md

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,17 +49,18 @@ Makes requests to `urls` and saves all files found with `sources` to `directory`
4949
**options** - object containing the following options:
5050

5151
- `urls`: array of urls to load and filenames for them *(required, see example below)*
52-
- `urlFilter`: function which is called for each url to check whether it should be scraped. *(optional, see example below)*
5352
- `directory`: path to save loaded files *(required)*
54-
- `filenameGenerator`: name of one of the bundled filenameGenerators, or a custom filenameGenerator function *(optional, default: 'byType')*
55-
- `defaultFilename`: filename for index page *(optional, default: 'index.html')*
56-
- `prettifyUrls`: whether urls should be 'prettified', by having the `defaultFilename` removed *(optional, default: false)*
5753
- `sources`: array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see example below)*
58-
- `subdirectories`: array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
59-
- `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)*
6054
- `recursive`: boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading *(optional, see example below)*
6155
- `maxDepth`: positive number, maximum allowed depth for dependencies *(optional, see example below)*
56+
- `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)*
57+
- `subdirectories`: array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
58+
- `defaultFilename`: filename for index page *(optional, default: 'index.html')*
59+
- `prettifyUrls`: whether urls should be 'prettified', by having the `defaultFilename` removed *(optional, default: false)*
6260
- `ignoreErrors`: boolean, if `true` scraper will continue downloading resources after an error has occurred, if `false` scraper will stop the process and return the error *(optional, default: true)*
61+
- `urlFilter`: function which is called for each url to check whether it should be scraped. *(optional, see example below)*
62+
- `filenameGenerator`: name of one of the bundled filenameGenerators, or a custom filenameGenerator function *(optional, default: 'byType')*
63+
- `httpResponseHandler`: function which is called on each response, allows you to customize the resource or reject its download *(optional, see example below)*
6364

6465
You can find the default options in [lib/config/defaults.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/config/defaults.js).
6566

@@ -85,6 +86,14 @@ When the `bySiteStructure` filenameGenerator is used the downloaded files are sa
8586
- `/about` => `DIRECTORY/about/index.html`
8687
- `/resources/javascript/libraries/jquery.min.js` => `DIRECTORY/resources/javascript/libraries/jquery.min.js`
8788

89+
### Http Response Handlers
90+
HttpResponseHandler is used to reject resource downloading or customize resource text based on response data (for example, status code, content type, etc.)
91+
The function takes a `response` argument - the response object of the [request](https://github.com/request/request) module - and should return a resolved `Promise` if the resource should be downloaded, or a `Promise` rejected with an `Error` if it should be skipped.
92+
Promise should be resolved with:
93+
* `string` which contains response body
94+
* or object with properties `body` (response body, string) and `metadata` - everything you want to save for this resource (like headers, original text, timestamps, etc.). The scraper will not use this field at all; it is only for the result.
95+
96+
See [example of using httpResponseHandler](#example-5-rejecting-resources-with-404-status-and-adding-metadata).
8897

8998
## Examples
9099
#### Example 1
@@ -176,6 +185,29 @@ scrape({
176185
}).then(console.log).catch(console.log);
177186
```
178187

188+
#### Example 5. Rejecting resources with 404 status and adding metadata
189+
```javascript
190+
var scrape = require('website-scraper');
191+
scrape({
192+
urls: ['http://example.com/'],
193+
directory: '/path/to/save',
194+
httpResponseHandler: (response) => {
195+
if (response.statusCode === 404) {
196+
return Promise.reject(new Error('status is 404'));
197+
} else {
198+
// if you don't need metadata - you can just return Promise.resolve(response.body)
199+
return Promise.resolve({
200+
body: response.body,
201+
metadata: {
202+
headers: response.headers,
203+
someOtherData: [ 1, 2, 3 ]
204+
}
205+
});
206+
}
207+
}
208+
}).then(console.log).catch(console.log);
209+
```
210+
179211
## Log and debug
180212
This module uses [debug](https://github.com/visionmedia/debug) to log events. To enable logs you should use environment variable `DEBUG`.
181213
The following command will log everything from website-scraper

lib/config/defaults.js

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,11 @@ var config = {
3030
jar: true,
3131
gzip: true
3232
},
33-
urlFilter: function urlFilter () {
34-
return true;
35-
},
33+
urlFilter: null,
3634
recursive: false,
3735
maxDepth: null,
38-
ignoreErrors: true
36+
ignoreErrors: true,
37+
httpResponseHandler: null
3938
};
4039

4140
module.exports = config;

lib/request.js

Lines changed: 68 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,79 @@
1-
var _ = require('lodash');
2-
var Promise = require('bluebird');
3-
var request = require('request');
4-
var get = Promise.promisify(request.get);
5-
var logger = require('./logger');
1+
'use strict';
2+
3+
const _ = require('lodash');
4+
const Promise = require('bluebird');
5+
const request = require('request');
6+
const get = Promise.promisify(request.get);
7+
const logger = require('./logger');
68

79
function getMimeType (contentType) {
810
return contentType ? contentType.split(';')[0] : null;
911
}
1012

11-
function makeRequest (options, url, referer) {
12-
var requestOptions = _.clone(options);
13-
requestOptions.url = url;
13+
function defaultResponseHandler (response) {
14+
return Promise.resolve(response.body);
15+
}
16+
17+
function transformResult (result) {
18+
switch (true) {
19+
case _.isString(result):
20+
return {
21+
body: result,
22+
metadata: null
23+
};
24+
case _.isPlainObject(result):
25+
return {
26+
body: result.body,
27+
metadata: result.metadata || null
28+
};
29+
default:
30+
throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result);
31+
}
32+
}
1433

15-
if (referer) {
16-
requestOptions.headers = requestOptions.headers || {};
17-
requestOptions.headers.referer = referer;
34+
class Request {
35+
/**
36+
*
37+
* @param {Object} options
38+
* @param {function} options.httpResponseHandler - custom response handler
39+
* @param {Object} options.request - custom options for request module
40+
*/
41+
constructor (options) {
42+
this.handleResponse = options && options.httpResponseHandler ? options.httpResponseHandler : defaultResponseHandler;
43+
this.options = options && options.request ? _.clone(options.request) : {};
1844
}
1945

20-
logger.debug(`[request] sending request for url ${url}, referer ${referer}`);
46+
/**
47+
* Performs get request to url and returns data for resource
48+
* @param {string} url - url of resource
49+
* @param {string} referer - url of parent resource
50+
* @return {Promise}
51+
*/
52+
get (url, referer) {
53+
let requestOptions = _.clone(this.options);
54+
requestOptions.url = url;
55+
56+
if (referer) {
57+
requestOptions.headers = requestOptions.headers || {};
58+
requestOptions.headers.referer = referer;
59+
}
2160

22-
return get(requestOptions).then(function handleResponse (data) {
23-
logger.debug(`[request] received response for ${data.request.href}, statusCode ${data.statusCode}`);
24-
return {
25-
url: data.request.href,
26-
mimeType: getMimeType(data.headers['content-type']),
27-
body: data.body
28-
};
29-
});
61+
logger.debug(`[request] sending request for url ${url}, referer ${referer}`);
62+
63+
return get(requestOptions).then((response) => {
64+
logger.debug(`[request] received response for ${response.request.href}, statusCode ${response.statusCode}`);
65+
return this.handleResponse(response)
66+
.then(transformResult)
67+
.then((responseHandlerResult) => {
68+
return {
69+
url: response.request.href,
70+
mimeType: getMimeType(response.headers['content-type']),
71+
body: responseHandlerResult.body,
72+
metadata: responseHandlerResult.metadata
73+
};
74+
});
75+
});
76+
}
3077
}
3178

32-
module.exports = makeRequest;
79+
module.exports = Request;

lib/resource.js

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,8 @@ Resource.prototype.setSaved = function setSaved () {
8888
this.saved = true;
8989
};
9090

91+
Resource.prototype.setMetadata = function setMetadata (metadata) {
92+
this.metadata = metadata;
93+
};
94+
9195
module.exports = Resource;

lib/scraper.js

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ var recursiveSources = require('./config/recursive-sources');
88
var Resource = require('./resource');
99

1010
var FilenameGenerator = require('./filename-generator');
11-
var makeRequest = require('./request');
11+
var Request = require('./request');
1212
var ResourceHandler = require('./resource-handler');
1313
var FSAdapter = require('./fs-adaper');
1414
var utils = require('./utils');
@@ -28,7 +28,7 @@ function Scraper (options) {
2828

2929
logger.info('init with options', self.options);
3030

31-
self.makeRequest = makeRequest.bind(null, self.options.request);
31+
self.request = new Request(self.options);
3232
self.resourceHandler = new ResourceHandler(self.options, self);
3333
self.filenameGenerator = new FilenameGenerator(self.options);
3434
self.fsAdapter = new FSAdapter(self.options);
@@ -80,7 +80,7 @@ Scraper.prototype.createNewRequest = function createNewRequest (resource) {
8080
var requestPromise = Promise.resolve()
8181
.then(function makeRequest () {
8282
var referer = resource.parent ? resource.parent.getUrl() : null;
83-
return self.makeRequest(url, referer);
83+
return self.request.get(url, referer);
8484
}).then(function requestCompleted (responseData) {
8585

8686
if (!utils.urlsEqual(responseData.url, url)) { // Url may be changed in redirects
@@ -104,6 +104,10 @@ Scraper.prototype.createNewRequest = function createNewRequest (resource) {
104104
resource.setType(utils.getTypeByFilename(filename));
105105
}
106106

107+
if (responseData.metadata) {
108+
resource.setMetadata(responseData.metadata);
109+
}
110+
107111
resource.setText(responseData.body);
108112
self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad
109113
return resource;
@@ -120,7 +124,7 @@ Scraper.prototype.requestResource = function requestResource (resource) {
120124
var self = this;
121125
var url = resource.getUrl();
122126

123-
if (!self.options.urlFilter(url)) {
127+
if (self.options.urlFilter && !self.options.urlFilter(url)) {
124128
logger.debug('filtering out ' + resource + ' by url filter');
125129
return Promise.resolve(null);
126130
}

0 commit comments

Comments
 (0)