Skip to content

Commit 589deb6

Browse files
committed
Fix error handling (#151)
* Fix error handling, WIP * WIP * WIP * add Resource method isSaved, setSaved, add test * Add error handling test for handleResource error * Return existing resource for updated url, add test for redirects and same resource from different urls * Fix failing loadResource tests * Rename scraper methods according to changes * Add es6 to eslint config
1 parent c968310 commit 589deb6

File tree

8 files changed

+238
-62
lines changed

8 files changed

+238
-62
lines changed

lib/fs-adaper.js

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ var outputFileAsync = Promise.promisify(fs.outputFile);
88
var ensureDirAsync = Promise.promisify(fs.ensureDir);
99
var removeAsync = Promise.promisify(fs.remove);
1010

11-
var logger = require('./logger');
1211

1312
function FSAdapter (options) {
1413
var self = this;
@@ -51,7 +50,6 @@ FSAdapter.prototype.saveResource = function saveResource (resource) {
5150

5251
var filename = path.join(self.options.absoluteDirectoryPath, resource.getFilename());
5352
var text = resource.getText();
54-
logger.info('saving resource ' + resource + ' to fs');
5553
return outputFileAsync(filename, text, { encoding: 'binary' }).then(function resourceSaved () {
5654
self.loadedResources.push(resource);
5755
});

lib/resource.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,4 +126,12 @@ Resource.prototype.toString = function toString () {
126126
return '{ url: "' + this.getUrl() + '", filename: "' + this.getFilename() + '", depth: ' + this.getDepth() + ' }';
127127
};
128128

129+
Resource.prototype.isSaved = function isSaved () {
130+
return this.saved || 0;
131+
};
132+
133+
Resource.prototype.setSaved = function setSaved () {
134+
this.saved = 1;
135+
};
136+
129137
module.exports = Resource;

lib/scraper.js

Lines changed: 37 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ function Scraper (options) {
4040

4141
self.occupiedFileNames = []; // Array of unavailable filenames
4242
self.respondedResourcePromises = {}; // Map url -> request promise
43-
self.loadedResourcePromises = {}; // Map url -> save-to-fs promise
43+
self.loadedResources = {}; // Map url -> resource
4444
}
4545

4646
Scraper.prototype.addOccupiedFileName = function addOccupiedFileName (filename) {
@@ -59,12 +59,12 @@ Scraper.prototype.getRespondedResourcePromise = function getRespondedResourcePro
5959
return this.respondedResourcePromises[utils.normalizeUrl(url)];
6060
};
6161

62-
Scraper.prototype.addLoadedResourcePromise = function addLoadedResourcePromise (url, promise) {
63-
this.loadedResourcePromises[utils.normalizeUrl(url)] = promise;
62+
Scraper.prototype.addLoadedResource = function addLoadedResourcePromise (url, promise) {
63+
this.loadedResources[utils.normalizeUrl(url)] = promise;
6464
};
6565

66-
Scraper.prototype.getLoadedResourcePromise = function getLoadedResourcePromise (url) {
67-
return this.loadedResourcePromises[utils.normalizeUrl(url)];
66+
Scraper.prototype.getLoadedResource = function getLoadedResourcePromise (url) {
67+
return this.loadedResources[utils.normalizeUrl(url)];
6868
};
6969

7070
Scraper.prototype.getHtmlSources = function getHtmlSources () {
@@ -75,34 +75,38 @@ Scraper.prototype.loadResource = function loadResource (resource) {
7575
var self = this;
7676
var url = resource.getUrl();
7777

78-
var loadedResourcePromise = self.getLoadedResourcePromise(url);
79-
if (loadedResourcePromise) {
78+
var loadedResource = self.getLoadedResource(url);
79+
if (loadedResource) {
8080
logger.debug('found loaded resource for ' + resource);
8181
} else {
8282
logger.debug('start loading resource ' + resource);
83-
loadedResourcePromise = Promise.resolve()
84-
.then(function handleResource () {
85-
var resourceHandler = self.getResourceHandler(resource);
86-
return resourceHandler(self, resource);
87-
}).then(function fileHandled () {
88-
return self.fsAdapter.saveResource(resource);
89-
}).catch(function handleError (err) {
90-
logger.warn('failed to save resource ' + resource);
91-
return self.handleError(err);
92-
});
93-
94-
self.addLoadedResourcePromise(url, loadedResourcePromise);
83+
self.addLoadedResource(url, resource);
9584
}
85+
};
9686

97-
return loadedResourcePromise;
87+
Scraper.prototype.saveResource = function saveResource (resource) {
88+
var self = this;
89+
resource.setSaved();
90+
91+
return Promise.resolve()
92+
.then(function handleResource () {
93+
var resourceHandler = self.getResourceHandler(resource);
94+
return resourceHandler(self, resource);
95+
}).then(function fileHandled () {
96+
logger.info('saving resource ' + resource + ' to fs');
97+
return self.fsAdapter.saveResource(resource);
98+
}).catch(function handleError (err) {
99+
logger.warn('failed to save resource ' + resource);
100+
return self.handleError(err);
101+
});
98102
};
99103

100104
Scraper.prototype.requestResource = function requestResource (resource) {
101105
var self = this;
102106
var url = resource.getUrl();
103107

104108
if (!self.options.urlFilter(url)) {
105-
logger.info('filtering out ' + resource + ' by url filter');
109+
logger.debug('filtering out ' + resource + ' by url filter');
106110
return Promise.resolve(null);
107111
}
108112

@@ -121,6 +125,10 @@ Scraper.prototype.requestResource = function requestResource (resource) {
121125

122126
if (!utils.urlsEqual(responseData.url, url)) { // Url may be changed in redirects
123127
logger.debug('url changed. old url = ' + url + ', new ulr = ' + responseData.url);
128+
var respondedNewUrlResource = self.getRespondedResourcePromise(responseData.url);
129+
if (respondedNewUrlResource) {
130+
return respondedNewUrlResource;
131+
}
124132
resource.setUrl(responseData.url);
125133
self.addRespondedResourcePromise(responseData.url, respondedResourcePromise);
126134
}
@@ -155,23 +163,21 @@ Scraper.prototype.load = function load () {
155163
// Do not wait for loadResource here, to prevent deadlock, scraper.waitForLoad
156164
self.loadResource(resOriginalResource);
157165
});
158-
}).then(function afterAllOriginalResourcesResponded () {
159-
return self.waitForLoad(0);
160-
});
166+
}).then(self.waitForLoad.bind(self));
161167
});
162168
};
163169

164170
// Returns a promise which gets resolved when all resources are loaded.
165-
// Determines whether loading is done by:
166-
// 1. Waiting until all loadedResourcePromises are resolved.
167-
// 2. Recursing itself if any new loadedResourcePromises (promises for the loading of childResources) where added during this time. If not, loading is done.
168-
Scraper.prototype.waitForLoad = function waitForLoad (previousCount) {
171+
// 1. Get all not saved resources and save them
172+
// 2. Recursion if any new not saved resource were added during this time. If not, loading is done.
173+
Scraper.prototype.waitForLoad = function waitForLoad () {
169174
var self = this;
170-
var count = _.size(self.loadedResourcePromises);
171-
var loadingIsFinished = (count === previousCount);
175+
var resourcesToSave = _.filter(self.loadedResources, function getNotSaved (r) { return !r.isSaved(); });
176+
var loadingIsFinished = _.isEmpty(resourcesToSave);
172177

173178
if (!loadingIsFinished) {
174-
return Promise.all(_.toArray(self.loadedResourcePromises)).then(self.waitForLoad.bind(self, count));
179+
return Promise.mapSeries(resourcesToSave, self.saveResource.bind(self))
180+
.then(self.waitForLoad.bind(self));
175181
}
176182
return Promise.resolve(self.originalResources);
177183
};

test/functional/error-handling.js

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
var should = require('should');
2+
var nock = require('nock');
3+
var sinon = require('sinon');
4+
require('sinon-as-promised');
5+
var fs = require('fs-extra');
6+
var Promise = require('bluebird');
7+
var Scraper = require('../../lib/scraper');
8+
9+
var testDirname = __dirname + '/.error-handling';
10+
var mockDirname = __dirname + '/mocks/error-handling';
11+
var scraper;
12+
13+
describe('Functional error handling', function() {
14+
15+
beforeEach(function () {
16+
nock.cleanAll();
17+
nock.disableNetConnect();
18+
19+
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
20+
nock('http://example.com/').get('/page1.html').delay(100).reply(200, 'ok');
21+
nock('http://example.com/').get('/page2.html').delay(200).reply(200, 'ok');
22+
nock('http://example.com/').get('/page3.html').delay(300).reply(200, 'ok');
23+
nock('http://example.com/').get('/page4.html').delay(400).reply(200, 'ok');
24+
nock('http://example.com/').get('/page5.html').delay(500).reply(200, 'ok');
25+
nock('http://example.com/').get('/page6.html').delay(600).reply(200, 'ok');
26+
27+
var options = {
28+
urls: [ 'http://example.com/' ],
29+
directory: testDirname,
30+
subdirectories: null,
31+
recursive: true,
32+
maxDepth: 2,
33+
sources: []
34+
};
35+
scraper = new Scraper(options);
36+
});
37+
38+
afterEach(function () {
39+
nock.cleanAll();
40+
nock.enableNetConnect();
41+
fs.removeSync(testDirname);
42+
});
43+
44+
describe('FS Error', function () {
45+
var loadToFsStub;
46+
47+
beforeEach(function() {
48+
scraper.fsAdapter.loadedResources = [1, 2];
49+
loadToFsStub = sinon.stub(scraper.fsAdapter, 'saveResource').resolves();
50+
loadToFsStub.onCall(2).rejects(new Error('FS FAILED!'));
51+
});
52+
53+
it('should remove directory and immediately reject on fs error if ignoreErrors is false', function () {
54+
scraper.options.ignoreErrors = false;
55+
56+
return scraper.scrape().then(function() {
57+
should(true).be.eql(false);
58+
}).catch(function (err) {
59+
fs.existsSync(testDirname).should.be.eql(false);
60+
should(err.message).be.eql('FS FAILED!');
61+
should(loadToFsStub.callCount).be.eql(3);
62+
});
63+
});
64+
65+
it('should ignore fs error if ignoreErrors is true', function () {
66+
scraper.options.ignoreErrors = true;
67+
68+
return scraper.scrape().then(function() {
69+
should(loadToFsStub.callCount).be.eql(7);
70+
fs.existsSync(testDirname).should.be.eql(true);
71+
});
72+
});
73+
});
74+
});
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Title</title>
6+
</head>
7+
<body>
8+
<a href="page1.html">1</a>
9+
<a href="page2.html">2</a>
10+
<a href="page3.html">3</a>
11+
<a href="page4.html">4</a>
12+
<a href="page5.html">5</a>
13+
<a href="page6.html">6</a>
14+
</body>
15+
</html>
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Title</title>
6+
</head>
7+
<body>
8+
<a href="/true-page.html">1</a>
9+
<a href="duplicating-page.html">2</a>
10+
<a href="http://duplicating.another-site.com/">3</a>
11+
</body>
12+
</html>

test/functional/redirects.js

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
require('should');
2+
var nock = require('nock');
3+
var fs = require('fs-extra');
4+
var sinon = require('sinon');
5+
var Scraper = require('../../lib/scraper');
6+
7+
var testDirname = __dirname + '/.redirects';
8+
var mockDirname = __dirname + '/mocks/redirects';
9+
10+
describe('Functional redirects', function() {
11+
12+
beforeEach(function() {
13+
nock.cleanAll();
14+
nock.disableNetConnect();
15+
});
16+
17+
afterEach(function() {
18+
nock.cleanAll();
19+
nock.enableNetConnect();
20+
fs.removeSync(testDirname);
21+
});
22+
23+
it('should follow redirects and save resource once if it has different urls', function() {
24+
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
25+
// true page - ok
26+
nock('http://example.com/').get('/true-page.html').reply(200, 'true page 1');
27+
// duplicating page - redirect to true page
28+
nock('http://example.com/').get('/duplicating-page.html').reply(302, '', {'Location': 'http://example.com/true-page.html'});
29+
nock('http://example.com/').get('/true-page.html').reply(200, 'true page 2');
30+
// duplicating site - redirect to duplicating page, then redirect to true page
31+
nock('http://duplicating.another-site.com/').get('/').reply(302, '', {'Location': 'http://example.com/duplicating-page.html'});
32+
nock('http://example.com/').get('/duplicating-page.html').reply(302, '', {'Location': 'http://example.com/true-page.html'});
33+
nock('http://example.com/').get('/true-page.html').reply(200, 'true page 3');
34+
35+
var options = {
36+
urls: [ 'http://example.com/' ],
37+
directory: testDirname,
38+
subdirectories: null,
39+
recursive: true,
40+
maxDepth: 2,
41+
sources: []
42+
};
43+
var scraper = new Scraper(options);
44+
var loadToFsSpy = sinon.spy(scraper.fsAdapter, 'saveResource');
45+
46+
return scraper.scrape(options).then(function() {
47+
loadToFsSpy.callCount.should.be.eql(2);
48+
loadToFsSpy.args[0][0].filename.should.be.eql('index.html');
49+
loadToFsSpy.args[1][0].filename.should.be.eql('true-page.html');
50+
51+
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
52+
fs.existsSync(testDirname + '/true-page.html').should.be.eql(true);
53+
54+
// should update all urls to true-page.html
55+
fs.readFileSync(testDirname + '/index.html').toString().should.containEql('<a href="true-page.html">1</a>');
56+
fs.readFileSync(testDirname + '/index.html').toString().should.containEql('<a href="true-page.html">2</a>');
57+
fs.readFileSync(testDirname + '/index.html').toString().should.containEql('<a href="true-page.html">3</a>');
58+
59+
// true-page.html should have body from 1st response
60+
fs.readFileSync(testDirname + '/true-page.html').toString().should.be.eql('true page 1');
61+
});
62+
});
63+
});

0 commit comments

Comments
 (0)