Use ignoreErrors=false by default (#332)

s0ph1e · web-flow · commit 5f4bb623154d · 2018-11-29T11:47:02.000+02:00
diff --git a/MIGRATION.md b/MIGRATION.md
@@ -107,6 +107,7 @@ scrape({
 		return Promise.reject(new Error('status is 404'));
 	} else {
 		return Promise.resolve(response.body);
+  	}
   }
 })
 
@@ -115,12 +116,11 @@ class MyAfterResponsePlugin {
   apply(registerAction) {
     registerAction('afterResponse', ({response}) => {
     if (response.statusCode === 404) {
-        return Promise.reject(new Error('status is 404'));
+        return null;
       } else {
-        return Promise.resolve(response.body);
-      });
-    }
-  });
+        return response.body;
+      }
+    });
   }
 }
 scrape({
diff --git a/README.md b/README.md
@@ -163,7 +163,7 @@ String, filename for index page. Defaults to `index.html`.
 Boolean, whether urls should be 'prettified', by having the `defaultFilename` removed. Defaults to `false`.
 
 #### ignoreErrors
-Boolean, if `true` scraper will continue downloading resources after error occurred, if `false` - scraper will finish process and return error. Defaults to `true`.
+Boolean, if `true` scraper will continue downloading resources after error occurred, if `false` - scraper will finish process and return error. Defaults to `false`.
 
 #### urlFilter
 Function which is called for each url to check whether it should be scraped. Defaults to `null` - no url filter will be applied.
@@ -300,16 +300,16 @@ If multiple actions `afterResponse` added - scraper will use result from last on
 // Do not save resources which responded with 404 not found status code
 registerAction('afterResponse', ({response}) => {
 	if (response.statusCode === 404) {
-			return Promise.reject(new Error('status is 404'));
-		} else {
-			// if you don't need metadata - you can just return Promise.resolve(response.body)
-			return Promise.resolve({
-				body: response.body,
-				metadata: {
-					headers: response.headers,
-					someOtherData: [ 1, 2, 3 ]
+			return null;
+	} else {
+		// if you don't need metadata - you can just return Promise.resolve(response.body)
+		return {
+			body: response.body,
+			metadata: {
+				headers: response.headers,
+				someOtherData: [ 1, 2, 3 ]
 			}
-		});
+		}
 	}
 });
 ```
diff --git a/lib/config/defaults.js b/lib/config/defaults.js
@@ -52,7 +52,7 @@ const config = {
 	recursive: false,
 	maxRecursiveDepth: null,
 	maxDepth: null,
-	ignoreErrors: true
+	ignoreErrors: false
 };
 
 module.exports = config;
diff --git a/lib/request.js b/lib/request.js
@@ -24,6 +24,8 @@ function transformResult (result) {
 				body: result.body,
 				metadata: result.metadata || null
 			};
+		case result === null:
+			return null;
 		default:
 			throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result);
 	}
@@ -44,6 +46,9 @@ module.exports.get = ({url, referer, options = {}, afterResponse = defaultRespon
 		return afterResponse({response})
 			.then(transformResult)
 			.then((responseHandlerResult) => {
+				if (!responseHandlerResult) {
+					return null;
+				}
 				return {
 					url: response.request.href,
 					mimeType: getMimeType(response.headers['content-type']),
diff --git a/lib/scraper.js b/lib/scraper.js
@@ -157,6 +157,10 @@ class Scraper {
 					afterResponse: this.actions.afterResponse.length ? this.runActions.bind(this, 'afterResponse') : undefined
 				}));
 			}).then(async function requestCompleted (responseData) {
+				if (!responseData) {
+					logger.debug('no response returned for url ' + url);
+					return null;
+				}
 
 				if (!urlsEqual(responseData.url, url)) { // Url may be changed in redirects
 					logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url);
diff --git a/test/functional/callbacks/callbacks.test.js b/test/functional/callbacks/callbacks.test.js
@@ -20,7 +20,7 @@ describe('Functional: onResourceSaved and onResourceError callbacks in plugin',
 		fs.removeSync(testDirname);
 	});
 
-	it('should call onResourceSaved callback and onResourceError callback', function() {
+	it('should call onResourceSaved callback and onResourceError callback if ignoreErrors = true', function() {
 		nock('http://example.com/').get('/').reply(200, 'OK');
 		nock('http://nodejs.org/').get('/').replyWithError('REQUEST ERROR!!');
 
@@ -38,6 +38,7 @@ describe('Functional: onResourceSaved and onResourceError callbacks in plugin',
 			urls: [ 'http://example.com/', 'http://nodejs.org/' ],
 			directory: testDirname,
 			subdirectories: null,
+			ignoreErrors: true,
 			plugins: [
 				new MyPlugin()
 			]
@@ -52,4 +53,38 @@ describe('Functional: onResourceSaved and onResourceError callbacks in plugin',
 			should(resourceErrorStub.args[0][0].error.message).be.eql('REQUEST ERROR!!');
 		});
 	});
+
+	it('should call onResourceError callback if ignoreErrors = false', function() {
+		// it is not necessary that 1st (successful) resource will be saved before error occurred, so skip onResourceSaved check
+		nock('http://example.com/').get('/').reply(200, 'OK');
+		nock('http://nodejs.org/').get('/').replyWithError('REQUEST ERROR!!');
+
+		const resourceSavedStub = sinon.stub();
+		const resourceErrorStub = sinon.stub();
+
+		class MyPlugin {
+			apply(addAction) {
+				addAction('onResourceSaved', resourceSavedStub);
+				addAction('onResourceError', resourceErrorStub);
+			}
+		}
+
+		const options = {
+			urls: [ 'http://example.com/', 'http://nodejs.org/' ],
+			directory: testDirname,
+			subdirectories: null,
+			ignoreErrors: true,
+			plugins: [
+				new MyPlugin()
+			]
+		};
+
+		return scrape(options).then(function() {
+			should(true).eql(false);
+		}).catch(() => {
+			should(resourceErrorStub.calledOnce).be.eql(true);
+			should(resourceErrorStub.args[0][0].resource.url).be.eql('http://nodejs.org/');
+			should(resourceErrorStub.args[0][0].error.message).be.eql('REQUEST ERROR!!');
+		});
+	});
 });
diff --git a/test/functional/request-response-customizations/after-response-action.test.js b/test/functional/request-response-customizations/after-response-action.test.js
@@ -0,0 +1,66 @@
+const should = require('should');
+const nock = require('nock');
+const fs = require('fs-extra');
+const scrape = require('../../../index');
+
+const testDirname = __dirname + '/.tmp';
+const mockDirname = __dirname + '/mocks';
+
+describe('Functional: afterResponse action in plugin', function() {
+
+	beforeEach(function() {
+		nock.cleanAll();
+		nock.disableNetConnect();
+	});
+
+	afterEach(function() {
+		nock.cleanAll();
+		nock.enableNetConnect();
+		fs.removeSync(testDirname);
+	});
+
+	it('should skip downloading resource if afterResponse returns null', function() {
+		nock('http://example.com/').get('/1.html').reply(200, 'content of 1.html');
+		nock('http://example.com/').get('/2.html').reply(404);
+
+		class Skip404ResponseHandler {
+			apply(add) {
+				add('afterResponse', ({response}) => {
+					if (response.statusCode === 404) {
+						return null;
+					} else {
+						return {
+							body: response.body,
+							metadata: {
+								headers: response.headers,
+								someOtherData: [ 1, 2, 3 ]
+							}
+						}
+					}
+				});
+			}
+		}
+
+		const options = {
+			urls: [
+				{ url: 'http://example.com/1.html', filename: '1.html' },
+				{ url: 'http://example.com/2.html', filename: '2.html' }
+			],
+			directory: testDirname,
+			plugins: [
+				new Skip404ResponseHandler()
+			]
+		};
+
+		return scrape(options).then(function(result) {
+			should(result[0]).have.properties({ url: 'http://example.com/1.html', filename: '1.html', saved: true });
+			should(result[1]).have.properties({ url: 'http://example.com/2.html', filename: '2.html', saved: false });
+
+			fs.existsSync(testDirname + '/1.html').should.be.eql(true);
+			const indexHtml = fs.readFileSync(testDirname + '/1.html').toString();
+			should(indexHtml).containEql('content of 1.html');
+
+			fs.existsSync(testDirname + '/2.html').should.be.eql(false);
+		});
+	});
+});
diff --git a/test/functional/request-response-customizations/request.test.js b/test/functional/request-response-customizations/request.test.js
diff --git a/test/functional/update-missing-sources/update-missing-sources.test.js b/test/functional/update-missing-sources/update-missing-sources.test.js
@@ -64,7 +64,8 @@ describe('Functional: update missing sources', () => {
 			directory: testDirname,
 			subdirectories: null,
 			sources: [{ selector: 'img', attr: 'src' }],
-			plugins: [ new UpdateMissingResourceReferencePlugin() ]
+			plugins: [ new UpdateMissingResourceReferencePlugin() ],
+			ignoreErrors: true
 		};
 
 		nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
@@ -138,7 +139,8 @@ describe('Functional: update missing sources', () => {
 			directory: testDirname,
 			subdirectories: null,
 			sources: [{selector: 'style'}],
-			plugins: [ new UpdateMissingResourceReferencePlugin() ]
+			plugins: [ new UpdateMissingResourceReferencePlugin() ],
+			ignoreErrors: true
 		};
 
 		nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/path-containers.html');
diff --git a/test/unit/scraper-test.js b/test/unit/scraper-test.js
@@ -22,28 +22,6 @@ describe('Scraper', function () {
 		fs.removeSync(testDirname);
 	});
 
-	describe('#load', function() {
-		it('should return array of objects with url, filename and children', function() {
-			nock('http://first-url.com').get('/').reply(200, 'OK');
-			nock('http://second-url.com').get('/').reply(500);
-
-			var s = new Scraper({
-				urls: [
-					'http://first-url.com',
-					'http://second-url.com'
-				],
-				directory: testDirname
-			});
-
-			return s.load().then(function(res) {
-				res.should.be.instanceOf(Array);
-				res.should.have.length(2);
-				res[0].should.be.instanceOf(Resource).and.have.properties(['url', 'filename', 'children']);
-				res[1].should.be.instanceOf(Resource).and.have.properties(['url', 'filename', 'children']);
-			});
-		});
-	});
-
 	describe('#errorCleanup', function() {
 		it('should throw error', function() {
 			var s = new Scraper({
@@ -345,6 +323,26 @@ describe('Scraper', function () {
 				err.message.should.be.eql('Awful error');
 			});
 		});
+
+		it('should return array of objects with url, filename and children', function() {
+			nock('http://first-url.com').get('/').reply(200, 'OK');
+			nock('http://second-url.com').get('/').reply(500);
+
+			var s = new Scraper({
+				urls: [
+					'http://first-url.com',
+					'http://second-url.com'
+				],
+				directory: testDirname
+			});
+
+			return s.scrape().then(function(res) {
+				res.should.be.instanceOf(Array);
+				res.should.have.length(2);
+				res[0].should.be.instanceOf(Resource).and.have.properties(['url', 'filename', 'children']);
+				res[1].should.be.instanceOf(Resource).and.have.properties(['url', 'filename', 'children']);
+			});
+		});
 	});
 
 	describe('#runActions', () => {