Skip to content

Commit 1af3274

Browse files
authored
Refactor code duplicates, call loadResource inside requestResource (#177)
* Refactor code duplicates, call loadResource inside requestResource * Split scraper.requestResource into 2 functions * Fix intendation
1 parent caf894e commit 1af3274

File tree

10 files changed

+109
-157
lines changed

10 files changed

+109
-157
lines changed

.eslintrc.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ parserOptions:
33
ecmaVersion: 6
44
env:
55
node: true
6+
es6: true
67
rules:
78
consistent-return: "error"
89
curly: "error"

lib/filename-generator/by-type.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
var _ = require('lodash');
22
var path = require('path');
3-
var utils = require('../utils.js');
3+
var utils = require('../utils');
44
var typeExtensions = require('../config/resource-ext-by-type');
55

66
module.exports = function generateFilename (resource, options, occupiedFileNames) {

lib/resource-handler/index.js

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,6 @@ ResourceHandler.prototype.handleChildrenResources = function handleChildrenResou
6262
}
6363

6464
pathsToUpdate.push({ oldPath: childPath, newPath: relativePath});
65-
66-
// Do not wait for loadResource here, to prevent deadlock, see scraper.waitForLoad
67-
self.context.loadResource(respondedResource);
6865
}
6966
return null; // Prevent Bluebird warnings
7067
});

lib/scraper.js

Lines changed: 41 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ var makeRequest = require('./request');
1212
var ResourceHandler = require('./resource-handler');
1313
var FSAdapter = require('./fs-adaper');
1414
var utils = require('./utils');
15+
var NormalizedUrlMap = require('./utils/normalized-url-map');
1516

1617
function Scraper (options) {
1718
var self = this;
@@ -40,36 +41,19 @@ function Scraper (options) {
4041
return new Resource(url, filename);
4142
});
4243

43-
self.respondedResourcePromises = {}; // Map url -> request promise
44-
self.loadedResources = {}; // Map url -> resource
44+
self.requestedResourcePromises = new NormalizedUrlMap(); // Map url -> request promise
45+
self.loadedResources = new NormalizedUrlMap(); // Map url -> resource
4546
}
4647

47-
Scraper.prototype.addRespondedResourcePromise = function addRespondedResourcePromise (url, promise) {
48-
this.respondedResourcePromises[utils.normalizeUrl(url)] = promise;
49-
};
50-
51-
Scraper.prototype.getRespondedResourcePromise = function getRespondedResourcePromise (url) {
52-
return this.respondedResourcePromises[utils.normalizeUrl(url)];
53-
};
54-
55-
Scraper.prototype.addLoadedResource = function addLoadedResourcePromise (url, promise) {
56-
this.loadedResources[utils.normalizeUrl(url)] = promise;
57-
};
58-
59-
Scraper.prototype.getLoadedResource = function getLoadedResourcePromise (url) {
60-
return this.loadedResources[utils.normalizeUrl(url)];
61-
};
62-
6348
Scraper.prototype.loadResource = function loadResource (resource) {
6449
var self = this;
6550
var url = resource.getUrl();
6651

67-
var loadedResource = self.getLoadedResource(url);
68-
if (loadedResource) {
52+
if (self.loadedResources.has(url)) {
6953
logger.debug('found loaded resource for ' + resource);
7054
} else {
71-
logger.debug('start loading resource ' + resource);
72-
self.addLoadedResource(url, resource);
55+
logger.debug('add loaded resource ' + resource);
56+
self.loadedResources.set(url, resource);
7357
}
7458
};
7559

@@ -89,63 +73,69 @@ Scraper.prototype.saveResource = function saveResource (resource) {
8973
});
9074
};
9175

92-
Scraper.prototype.requestResource = function requestResource (resource) {
76+
Scraper.prototype.createNewRequest = function createNewRequest (resource) {
9377
var self = this;
9478
var url = resource.getUrl();
9579

96-
if (!self.options.urlFilter(url)) {
97-
logger.debug('filtering out ' + resource + ' by url filter');
98-
return Promise.resolve(null);
99-
}
100-
101-
if (self.options.maxDepth && resource.getDepth() > self.options.maxDepth) {
102-
logger.debug('filtering out ' + resource + ' by depth');
103-
return Promise.resolve(null);
104-
}
105-
106-
var respondedResourcePromise = self.getRespondedResourcePromise(url);
107-
if (respondedResourcePromise) {
108-
logger.debug('found responded resource for ' + resource);
109-
return respondedResourcePromise;
110-
}
111-
112-
respondedResourcePromise = Promise.resolve()
80+
var requestPromise = Promise.resolve()
11381
.then(function makeRequest () {
11482
var referer = resource.parent ? resource.parent.getUrl() : null;
11583
return self.makeRequest(url, referer);
11684
}).then(function requestCompleted (responseData) {
11785

11886
if (!utils.urlsEqual(responseData.url, url)) { // Url may be changed in redirects
11987
logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url);
120-
var respondedNewUrlResource = self.getRespondedResourcePromise(responseData.url);
121-
if (respondedNewUrlResource) {
122-
return respondedNewUrlResource;
88+
89+
if (self.requestedResourcePromises.has(responseData.url)) {
90+
return self.requestedResourcePromises.get(responseData.url);
12391
}
92+
12493
resource.setUrl(responseData.url);
125-
self.addRespondedResourcePromise(responseData.url, respondedResourcePromise);
94+
self.requestedResourcePromises.set(responseData.url, requestPromise);
12695
}
12796

12897
resource.setType(utils.getTypeByMime(responseData.mimeType));
12998

13099
var filename = self.filenameGenerator.generateFilename(resource);
131100
resource.setFilename(filename);
132101

133-
// if type was not determined by mime
134-
// we can try to get it from filename after it was generated
102+
// if type was not determined by mime we can try to get it from filename after it was generated
135103
if (!resource.getType()) {
136104
resource.setType(utils.getTypeByFilename(filename));
137105
}
138106

139107
resource.setText(responseData.body);
108+
self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad
140109
return resource;
141110
}).catch(function handleError (err) {
142111
logger.warn('failed to request resource ' + resource);
143112
return self.handleError(err);
144113
});
145114

146-
self.addRespondedResourcePromise(url, respondedResourcePromise);
115+
self.requestedResourcePromises.set(url, requestPromise);
116+
return requestPromise;
117+
};
147118

148-
return respondedResourcePromise;
119+
Scraper.prototype.requestResource = function requestResource (resource) {
120+
var self = this;
121+
var url = resource.getUrl();
122+
123+
if (!self.options.urlFilter(url)) {
124+
logger.debug('filtering out ' + resource + ' by url filter');
125+
return Promise.resolve(null);
126+
}
127+
128+
if (self.options.maxDepth && resource.getDepth() > self.options.maxDepth) {
129+
logger.debug('filtering out ' + resource + ' by depth');
130+
return Promise.resolve(null);
131+
}
132+
133+
if (self.requestedResourcePromises.has(url)) {
134+
logger.debug('found requested resource for ' + resource);
135+
return self.requestedResourcePromises.get(url);
136+
}
137+
138+
return self.createNewRequest(resource);
149139
};
150140

151141
Scraper.prototype.validate = function validate () {
@@ -155,23 +145,16 @@ Scraper.prototype.validate = function validate () {
155145
Scraper.prototype.load = function load () {
156146
var self = this;
157147
return self.fsAdapter.createDirectory().then(function loadAllResources () {
158-
return Promise.map(self.originalResources, function loadResource (originalResource) {
159-
return self.requestResource(originalResource).then(function receivedResponse (resOriginalResource) {
160-
if (resOriginalResource) {
161-
// Do not wait for loadResource here, to prevent deadlock, scraper.waitForLoad
162-
self.loadResource(resOriginalResource);
163-
}
164-
});
165-
}).then(self.waitForLoad.bind(self));
166-
});
148+
return Promise.map(self.originalResources, self.requestResource.bind(self));
149+
}).then(self.waitForLoad.bind(self));
167150
};
168151

169152
// Returns a promise which gets resolved when all resources are loaded.
170153
// 1. Get all not saved resources and save them
171154
// 2. Recursion if any new not saved resource were added during this time. If not, loading is done.
172155
Scraper.prototype.waitForLoad = function waitForLoad () {
173156
var self = this;
174-
var resourcesToSave = _.filter(self.loadedResources, (r) => !r.isSaved());
157+
var resourcesToSave = Array.from(self.loadedResources.values()).filter((r) => !r.isSaved());
175158
var loadingIsFinished = _.isEmpty(resourcesToSave);
176159

177160
if (!loadingIsFinished) {

lib/utils.js renamed to lib/utils/index.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@ var path = require('path');
33
var Promise = require('bluebird');
44
var normalizeUrl = require('normalize-url');
55
var htmlEntities = require('he');
6-
var typeByMime = require('./config/resource-type-by-mime');
7-
var typeByExt = require('./config/resource-type-by-ext');
6+
var typeByMime = require('../config/resource-type-by-mime');
7+
var typeByExt = require('../config/resource-type-by-ext');
88

9-
var logger = require('./logger');
9+
var logger = require('../logger');
1010

1111
var MAX_FILENAME_LENGTH = 255;
1212
var IS_URL = /^((http[s]?:)?\/\/)/;

lib/utils/normalized-url-map.js

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
'use strict';
2+
3+
var normalizeUrl = require('./index').normalizeUrl;
4+
5+
class NormalizedUrlMap extends Map {
6+
get (key) {
7+
return super.get(normalizeUrl(key));
8+
}
9+
10+
set (key, value) {
11+
return super.set(normalizeUrl(key), value);
12+
}
13+
14+
has (key) {
15+
return super.has(normalizeUrl(key));
16+
}
17+
}
18+
19+
module.exports = NormalizedUrlMap;

test/unit/resource-handler/index.test.js

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -177,18 +177,6 @@ describe('ResourceHandler', function() {
177177
});
178178
});
179179

180-
it('should call loadResource with resource returned by requestResource', function () {
181-
pathContainer.getPaths.returns(['http://example.com/child']);
182-
183-
var childResourceRespondedMock = new Resource('http://example.com/child', 'child.png');
184-
scraperContext.requestResource.resolves(childResourceRespondedMock);
185-
186-
return resHandler.handleChildrenResources(pathContainer, parentResource).then(function () {
187-
scraperContext.loadResource.calledOnce.should.be.eql(true);
188-
scraperContext.loadResource.args[0][0].should.be.eql(childResourceRespondedMock);
189-
});
190-
});
191-
192180
it('should update paths in text with local files returned by requestResource', function () {
193181
pathContainer.getPaths.returns([
194182
'http://first.com/img/a.jpg',

test/unit/scraper-test.js

Lines changed: 13 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -120,47 +120,6 @@ describe('Scraper', function () {
120120
});
121121
});
122122

123-
it('should call loadResource for each url', function() {
124-
nock('http://first-url.com').get('/').reply(200, 'OK');
125-
nock('http://second-url.com').get('/').reply(200, 'OK');
126-
127-
var s = new Scraper({
128-
urls: [
129-
'http://first-url.com',
130-
'http://second-url.com'
131-
],
132-
directory: testDirname
133-
});
134-
135-
var loadResourceSpy = sinon.spy(s, 'loadResource');
136-
return s.load().then(function() {
137-
loadResourceSpy.calledTwice.should.be.eql(true);
138-
});
139-
});
140-
141-
it('should not call loadResource if no resource was returned', function() {
142-
nock('http://first-url.com').get('/').reply(200, 'OK');
143-
nock('http://second-url.com').get('/').reply(200, 'OK');
144-
145-
var s = new Scraper({
146-
urls: [
147-
'http://first-url.com',
148-
'http://second-url.com'
149-
],
150-
directory: testDirname
151-
});
152-
153-
var loadResourceSpy = sinon.spy(s, 'loadResource');
154-
var requestStub = sinon.stub(s, 'requestResource');
155-
requestStub.onCall(0).resolves(null);
156-
requestStub.onCall(1).resolves(new Resource('http://second-url.com'));
157-
158-
return s.load().then(function() {
159-
loadResourceSpy.calledOnce.should.be.eql(true);
160-
loadResourceSpy.args[0][0].url.should.be.eql('http://second-url.com');
161-
});
162-
});
163-
164123
it('should return array of objects with url, filename and children', function() {
165124
nock('http://first-url.com').get('/').reply(200, 'OK');
166125
nock('http://second-url.com').get('/').reply(500);
@@ -237,59 +196,34 @@ describe('Scraper', function () {
237196
});
238197
});
239198

240-
describe('#getLoadedResource', function() {
241-
it('should find nothing if no resource with same url was loaded',function() {
242-
var s = new Scraper({
243-
urls: 'http://example.com',
244-
directory: testDirname
245-
});
246-
var a = new Resource('http://first-resource.com');
247-
var loaded = s.getLoadedResource(a.getUrl());
248-
should(loaded).be.eql(undefined);
249-
});
250-
251-
it('should find loaded resource with same url', function() {
199+
describe('#loadResource', function() {
200+
it('should add different resource to the map', function() {
252201
var s = new Scraper({
253202
urls: 'http://example.com',
254203
directory: testDirname
255204
});
256205

257-
var a = new Resource('http://first-resource.com');
258-
s.addLoadedResource(a.getUrl(), a);
206+
var r1 = new Resource('http://example.com/a1.png', 'a1.png');
207+
var r2 = new Resource('http://example.com/a2.png', 'a2.png');
208+
var r3 = new Resource('http://example.com/a3.png', 'a3.png');
259209

260-
var b = new Resource('http://first-resource.com');
261-
var c = new Resource('http://first-resource.com/');
262-
var d = new Resource('http://first-resource.com?');
263-
should(s.getLoadedResource(b.getUrl())).be.equal(a);
264-
should(s.getLoadedResource(c.getUrl())).be.equal(a);
265-
should(s.getLoadedResource(d.getUrl())).be.equal(a);
210+
s.loadResource(r1);
211+
s.loadResource(r2);
212+
s.loadResource(r3);
213+
s.loadedResources.size.should.be.eql(3);
266214
});
267-
});
268215

269-
describe('#loadResource', function() {
270-
it('should not save the same resource twice (should skip already loaded)', function() {
216+
it('should not add the same resource twice', function() {
271217
var s = new Scraper({
272218
urls: 'http://example.com',
273219
directory: testDirname
274220
});
275-
s.getResourceHandler = sinon.stub().returns(_.noop);
276-
277-
sinon.stub(s, 'getLoadedResource')
278-
.withArgs('http://example.com/a.png')
279-
.onFirstCall().returns()
280-
.onSecondCall().returns(Promise.resolve());
281-
282-
sinon.spy(s, 'addLoadedResource');
283221

284222
var r = new Resource('http://example.com/a.png', 'a.png');
285223

286224
s.loadResource(r);
287-
s.getLoadedResource.calledOnce.should.be.eql(true);
288-
s.addLoadedResource.calledOnce.should.be.eql(true);
289-
290225
s.loadResource(r);
291-
s.getLoadedResource.calledTwice.should.be.eql(true);
292-
s.addLoadedResource.calledOnce.should.be.eql(true);
226+
s.loadedResources.size.should.be.eql(1);
293227
});
294228
});
295229

@@ -299,12 +233,12 @@ describe('Scraper', function () {
299233
urls: 'http://example.com',
300234
directory: testDirname
301235
});
302-
s.getResourceHandler = sinon.stub().returns(_.noop);
303-
sinon.spy(s, 'addLoadedResource');
304236

305237
var r = new Resource('http://example.com/a.png', 'a.png');
306238
r.setText('some text');
307239

240+
s.resourceHandler.handleResource = sinon.stub().resolves(r);
241+
308242
return s.saveResource(r).then(function() {
309243
var text = fs.readFileSync(path.join(testDirname, r.getFilename())).toString();
310244
text.should.be.eql(r.getText());

0 commit comments

Comments
 (0)