Skip to content

Commit 5f4bb62

Browse files
authored
Use ignoreErrors=false by default (#332)
1 parent 5c6209d commit 5f4bb62

File tree

10 files changed

+151
-41
lines changed

10 files changed

+151
-41
lines changed

MIGRATION.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ scrape({
107107
return Promise.reject(new Error('status is 404'));
108108
} else {
109109
return Promise.resolve(response.body);
110+
}
110111
}
111112
})
112113

@@ -115,12 +116,11 @@ class MyAfterResponsePlugin {
115116
apply(registerAction) {
116117
registerAction('afterResponse', ({response}) => {
117118
if (response.statusCode === 404) {
118-
return Promise.reject(new Error('status is 404'));
119+
return null;
119120
} else {
120-
return Promise.resolve(response.body);
121-
});
122-
}
123-
});
121+
return response.body;
122+
}
123+
});
124124
}
125125
}
126126
scrape({

README.md

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ String, filename for index page. Defaults to `index.html`.
163163
Boolean, whether urls should be 'prettified', by having the `defaultFilename` removed. Defaults to `false`.
164164

165165
#### ignoreErrors
166-
Boolean, if `true` scraper will continue downloading resources after error occurred, if `false` - scraper will finish process and return error. Defaults to `true`.
166+
Boolean, if `true` scraper will continue downloading resources after error occurred, if `false` - scraper will finish process and return error. Defaults to `false`.
167167

168168
#### urlFilter
169169
Function which is called for each url to check whether it should be scraped. Defaults to `null` - no url filter will be applied.
@@ -300,16 +300,16 @@ If multiple actions `afterResponse` added - scraper will use result from last on
300300
// Do not save resources which responded with 404 not found status code
301301
registerAction('afterResponse', ({response}) => {
302302
if (response.statusCode === 404) {
303-
return Promise.reject(new Error('status is 404'));
304-
} else {
305-
// if you don't need metadata - you can just return Promise.resolve(response.body)
306-
return Promise.resolve({
307-
body: response.body,
308-
metadata: {
309-
headers: response.headers,
310-
someOtherData: [ 1, 2, 3 ]
303+
return null;
304+
} else {
305+
// if you don't need metadata - you can just return Promise.resolve(response.body)
306+
return {
307+
body: response.body,
308+
metadata: {
309+
headers: response.headers,
310+
someOtherData: [ 1, 2, 3 ]
311311
}
312-
});
312+
}
313313
}
314314
});
315315
```

lib/config/defaults.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ const config = {
5252
recursive: false,
5353
maxRecursiveDepth: null,
5454
maxDepth: null,
55-
ignoreErrors: true
55+
ignoreErrors: false
5656
};
5757

5858
module.exports = config;

lib/request.js

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ function transformResult (result) {
2424
body: result.body,
2525
metadata: result.metadata || null
2626
};
27+
case result === null:
28+
return null;
2729
default:
2830
throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result);
2931
}
@@ -44,6 +46,9 @@ module.exports.get = ({url, referer, options = {}, afterResponse = defaultRespon
4446
return afterResponse({response})
4547
.then(transformResult)
4648
.then((responseHandlerResult) => {
49+
if (!responseHandlerResult) {
50+
return null;
51+
}
4752
return {
4853
url: response.request.href,
4954
mimeType: getMimeType(response.headers['content-type']),

lib/scraper.js

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,10 @@ class Scraper {
157157
afterResponse: this.actions.afterResponse.length ? this.runActions.bind(this, 'afterResponse') : undefined
158158
}));
159159
}).then(async function requestCompleted (responseData) {
160+
if (!responseData) {
161+
logger.debug('no response returned for url ' + url);
162+
return null;
163+
}
160164

161165
if (!urlsEqual(responseData.url, url)) { // Url may be changed in redirects
162166
logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url);

test/functional/callbacks/callbacks.test.js

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ describe('Functional: onResourceSaved and onResourceError callbacks in plugin',
2020
fs.removeSync(testDirname);
2121
});
2222

23-
it('should call onResourceSaved callback and onResourceError callback', function() {
23+
it('should call onResourceSaved callback and onResourceError callback if ignoreErrors = true', function() {
2424
nock('http://example.com/').get('/').reply(200, 'OK');
2525
nock('http://nodejs.org/').get('/').replyWithError('REQUEST ERROR!!');
2626

@@ -38,6 +38,7 @@ describe('Functional: onResourceSaved and onResourceError callbacks in plugin',
3838
urls: [ 'http://example.com/', 'http://nodejs.org/' ],
3939
directory: testDirname,
4040
subdirectories: null,
41+
ignoreErrors: true,
4142
plugins: [
4243
new MyPlugin()
4344
]
@@ -52,4 +53,38 @@ describe('Functional: onResourceSaved and onResourceError callbacks in plugin',
5253
should(resourceErrorStub.args[0][0].error.message).be.eql('REQUEST ERROR!!');
5354
});
5455
});
56+
57+
it('should call onResourceError callback if ignoreErrors = false', function() {
58+
// it is not necessary that 1st (successful) resource will be saved before error occurred, so skip onResourceSaved check
59+
nock('http://example.com/').get('/').reply(200, 'OK');
60+
nock('http://nodejs.org/').get('/').replyWithError('REQUEST ERROR!!');
61+
62+
const resourceSavedStub = sinon.stub();
63+
const resourceErrorStub = sinon.stub();
64+
65+
class MyPlugin {
66+
apply(addAction) {
67+
addAction('onResourceSaved', resourceSavedStub);
68+
addAction('onResourceError', resourceErrorStub);
69+
}
70+
}
71+
72+
const options = {
73+
urls: [ 'http://example.com/', 'http://nodejs.org/' ],
74+
directory: testDirname,
75+
subdirectories: null,
76+
ignoreErrors: true,
77+
plugins: [
78+
new MyPlugin()
79+
]
80+
};
81+
82+
return scrape(options).then(function() {
83+
should(true).eql(false);
84+
}).catch(() => {
85+
should(resourceErrorStub.calledOnce).be.eql(true);
86+
should(resourceErrorStub.args[0][0].resource.url).be.eql('http://nodejs.org/');
87+
should(resourceErrorStub.args[0][0].error.message).be.eql('REQUEST ERROR!!');
88+
});
89+
});
5590
});
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
const should = require('should');
2+
const nock = require('nock');
3+
const fs = require('fs-extra');
4+
const scrape = require('../../../index');
5+
6+
const testDirname = __dirname + '/.tmp';
7+
const mockDirname = __dirname + '/mocks';
8+
9+
describe('Functional: afterResponse action in plugin', function() {
10+
11+
beforeEach(function() {
12+
nock.cleanAll();
13+
nock.disableNetConnect();
14+
});
15+
16+
afterEach(function() {
17+
nock.cleanAll();
18+
nock.enableNetConnect();
19+
fs.removeSync(testDirname);
20+
});
21+
22+
it('should skip downloading resource if afterResponse returns null', function() {
23+
nock('http://example.com/').get('/1.html').reply(200, 'content of 1.html');
24+
nock('http://example.com/').get('/2.html').reply(404);
25+
26+
class Skip404ResponseHandler {
27+
apply(add) {
28+
add('afterResponse', ({response}) => {
29+
if (response.statusCode === 404) {
30+
return null;
31+
} else {
32+
return {
33+
body: response.body,
34+
metadata: {
35+
headers: response.headers,
36+
someOtherData: [ 1, 2, 3 ]
37+
}
38+
}
39+
}
40+
});
41+
}
42+
}
43+
44+
const options = {
45+
urls: [
46+
{ url: 'http://example.com/1.html', filename: '1.html' },
47+
{ url: 'http://example.com/2.html', filename: '2.html' }
48+
],
49+
directory: testDirname,
50+
plugins: [
51+
new Skip404ResponseHandler()
52+
]
53+
};
54+
55+
return scrape(options).then(function(result) {
56+
should(result[0]).have.properties({ url: 'http://example.com/1.html', filename: '1.html', saved: true });
57+
should(result[1]).have.properties({ url: 'http://example.com/2.html', filename: '2.html', saved: false });
58+
59+
fs.existsSync(testDirname + '/1.html').should.be.eql(true);
60+
const indexHtml = fs.readFileSync(testDirname + '/1.html').toString();
61+
should(indexHtml).containEql('content of 1.html');
62+
63+
fs.existsSync(testDirname + '/2.html').should.be.eql(false);
64+
});
65+
});
66+
});

test/functional/update-missing-sources/update-missing-sources.test.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ describe('Functional: update missing sources', () => {
6464
directory: testDirname,
6565
subdirectories: null,
6666
sources: [{ selector: 'img', attr: 'src' }],
67-
plugins: [ new UpdateMissingResourceReferencePlugin() ]
67+
plugins: [ new UpdateMissingResourceReferencePlugin() ],
68+
ignoreErrors: true
6869
};
6970

7071
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
@@ -138,7 +139,8 @@ describe('Functional: update missing sources', () => {
138139
directory: testDirname,
139140
subdirectories: null,
140141
sources: [{selector: 'style'}],
141-
plugins: [ new UpdateMissingResourceReferencePlugin() ]
142+
plugins: [ new UpdateMissingResourceReferencePlugin() ],
143+
ignoreErrors: true
142144
};
143145

144146
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/path-containers.html');

test/unit/scraper-test.js

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -22,28 +22,6 @@ describe('Scraper', function () {
2222
fs.removeSync(testDirname);
2323
});
2424

25-
describe('#load', function() {
26-
it('should return array of objects with url, filename and children', function() {
27-
nock('http://first-url.com').get('/').reply(200, 'OK');
28-
nock('http://second-url.com').get('/').reply(500);
29-
30-
var s = new Scraper({
31-
urls: [
32-
'http://first-url.com',
33-
'http://second-url.com'
34-
],
35-
directory: testDirname
36-
});
37-
38-
return s.load().then(function(res) {
39-
res.should.be.instanceOf(Array);
40-
res.should.have.length(2);
41-
res[0].should.be.instanceOf(Resource).and.have.properties(['url', 'filename', 'children']);
42-
res[1].should.be.instanceOf(Resource).and.have.properties(['url', 'filename', 'children']);
43-
});
44-
});
45-
});
46-
4725
describe('#errorCleanup', function() {
4826
it('should throw error', function() {
4927
var s = new Scraper({
@@ -345,6 +323,26 @@ describe('Scraper', function () {
345323
err.message.should.be.eql('Awful error');
346324
});
347325
});
326+
327+
it('should return array of objects with url, filename and children', function() {
328+
nock('http://first-url.com').get('/').reply(200, 'OK');
329+
nock('http://second-url.com').get('/').reply(500);
330+
331+
var s = new Scraper({
332+
urls: [
333+
'http://first-url.com',
334+
'http://second-url.com'
335+
],
336+
directory: testDirname
337+
});
338+
339+
return s.scrape().then(function(res) {
340+
res.should.be.instanceOf(Array);
341+
res.should.have.length(2);
342+
res[0].should.be.instanceOf(Resource).and.have.properties(['url', 'filename', 'children']);
343+
res[1].should.be.instanceOf(Resource).and.have.properties(['url', 'filename', 'children']);
344+
});
345+
});
348346
});
349347

350348
describe('#runActions', () => {

0 commit comments

Comments
 (0)