Skip to content

Commit 479b8ec

Browse files
Mark Lagendijks0ph1e
authored andcommitted
Improved mechanism for re-using already loaded / loading resources. (#78)
* Improved mechanism for re-using already loaded / loading resources. * Rewrote loading logic: 1. `requestResource` executes the request for the resource, and saves the response. Processing multiple times is prevented by maintaining `respondedResourcePromises`. 2. `loadResource` calls `requestResource` and saves the resource to disk. Processing multiple times is prevented by maintaining `loadedResourcePromises`. 3. To prevent deadlocks `file-handlers/html.js` only waits for `requestResource`, not for `loadResource`. 4. To determine when the scraper is finished `loadedResourcePromises` is used. * Fix bluebird warning * Code style fixes * Fixed bug * Style fix * Set filename when response was recieved, to always use the responded filename * Cleanup * Update scraper.js Added `waitForLoad` explanation.
1 parent b6c64a9 commit 479b8ec

File tree

7 files changed

+220
-115
lines changed

7 files changed

+220
-115
lines changed

lib/file-handlers/html.js

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,16 @@ function loadImgSrcsetResource (context, parentResource, childResourceHtmlData)
7979
var childResourceUrl = utils.getUrl(parentResource.getUrl(), imgScrsetPart.url);
8080
var childResource = parentResource.createChild(childResourceUrl);
8181
childResource.setHtmlData(childResourceHtmlData);
82-
83-
return context.loadResource(childResource).then(function updateSrcsetPart (loadedResource) {
84-
if(loadedResource){
85-
parentResource.updateChild(childResource, loadedResource);
86-
imgScrsetPart.url = loadedResource.getFilename();
82+
83+
return context.requestResource(childResource).then(function updateSrcsetPart (respondedResource) {
84+
if(respondedResource){
85+
parentResource.updateChild(childResource, respondedResource);
86+
imgScrsetPart.url = respondedResource.getFilename();
87+
88+
// Do not wait for loadResource here, to prevent deadlock, see scraper.waitForLoad
89+
context.loadResource(childResource);
8790
}
91+
return null; // Prevent Bluebird warnings
8892
});
8993
}).then(function updateSrcset () {
9094
return Promise.resolve(srcset.stringify(imgScrsetParts));
@@ -102,25 +106,28 @@ function loadGeneralResource (context, parentResource, childResourceHtmlData) {
102106
var attr = childResourceHtmlData.attributeValue;
103107

104108
var resourceUrl = utils.getUrl(parentResource.getUrl(), attr);
105-
var htmlResource = parentResource.createChild(resourceUrl);
106-
htmlResource.setHtmlData(childResourceHtmlData);
109+
var childResource = parentResource.createChild(resourceUrl);
110+
childResource.setHtmlData(childResourceHtmlData);
107111

108-
return context.loadResource(htmlResource).then(function handleLoadedSource (loadedResource) {
109-
if(!loadedResource){
112+
return context.requestResource(childResource).then(function handleLoadedSource (respondedResource) {
113+
if(!respondedResource){
110114
return attr;
111115
}
112-
parentResource.updateChild(htmlResource, loadedResource);
116+
parentResource.updateChild(childResource, respondedResource);
113117

114-
var relativePath = utils.getRelativePath(parentResource.getFilename(), loadedResource.getFilename());
118+
var relativePath = utils.getRelativePath(parentResource.getFilename(), respondedResource.getFilename());
115119
if(context.options.prettifyUrls){
116120
relativePath = relativePath.replace(context.options.defaultFilename, '');
117121
}
118122
var hash = utils.getHashFromUrl(attr);
119123

120-
if (hash && loadedResource.isHtml()) {
124+
if (hash && respondedResource.isHtml()) {
121125
relativePath = relativePath.concat(hash);
122126
}
123127

128+
// Do not wait for loadResource here, to prevent deadlock, scraper.waitForLoad
129+
context.loadResource(childResource);
130+
124131
return relativePath;
125132
});
126133
}

lib/filename-generators/by-type.js

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ var path = require('path');
33
var utils = require('../utils.js');
44
var defaultExtensions = require('../config/resource-extensions-by-type');
55

6-
module.exports = function generateFilename (resource, options, loadedResources) {
7-
var occupiedFilenames = getOccupiedFilenames(loadedResources, options);
6+
module.exports = function generateFilename (resource, options, occupiedFileNames) {
7+
var occupiedNames = getSubDirectoryNames(options).concat(occupiedFileNames);
88

99
var filename = getFilenameForResource(resource, options);
1010
var extension = utils.getFilenameExtension(filename);
@@ -14,7 +14,7 @@ module.exports = function generateFilename (resource, options, loadedResources)
1414
var basename = path.basename(filename, extension);
1515
var index = 1;
1616

17-
while (_.includes(occupiedFilenames, currentFilename)) {
17+
while (_.includes(occupiedNames, currentFilename)) {
1818
currentFilename = path.join(directory, basename + '_' + index + extension);
1919
index++;
2020
}
@@ -37,10 +37,8 @@ function getFilenameForResource (resource, options) {
3737
return filename;
3838
}
3939

40-
function getOccupiedFilenames (loadedResources, options) {
41-
var subdirectories = _.map(options.subdirectories, function getDirectory (directory) { return directory.directory; });
42-
var loadedFiles = _.map(loadedResources, function getFileName (resource) { return resource.getFilename(); });
43-
return subdirectories.concat(loadedFiles);
40+
function getSubDirectoryNames (options) {
41+
return _.map(options.subdirectories, function getDirectory (directory) { return directory.directory; });
4442
}
4543

4644
function getDirectoryByExtension (extension, options) {

lib/scraper.js

Lines changed: 115 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ var Resource = require('./resource');
1414

1515
var getFilenameGenerator = require('./filename-generators/filename-generator-getter');
1616
var makeRequest = require('./request');
17-
var compareUrls = require('compare-urls');
17+
var normalizeUrl = require('normalize-url');
1818

1919
var loadHtml = require('./file-handlers/html');
2020
var loadCss = require('./file-handlers/css');
@@ -26,24 +26,49 @@ function loadHtmlAndCss (context, po) {
2626

2727
function Scraper (options) {
2828
this.originalResources = [];
29+
this.occupiedFileNames = [];
2930
this.loadedResources = [];
31+
this.respondedResourcePromises = {};
32+
this.loadedResourcePromises = {};
3033

3134
this.options = _.extend({}, defaults, options);
3235
this.options.request = _.extend({}, defaults.request, options.request);
3336
this.options.directory = path.resolve(process.cwd(), this.options.directory || '');
3437
this.options.filenameGenerator = getFilenameGenerator(this.options.filenameGenerator);
3538
}
3639

37-
Scraper.prototype.getLoadedResource = function getLoadedResource (resource) {
38-
return _.find(this.loadedResources, function checkUrlsEqual (lr) {
39-
return compareUrls(resource.getUrl(), lr.getUrl());
40-
});
40+
Scraper.prototype.addOccupiedFileName = function addOccupiedFileName (filename) {
41+
this.occupiedFileNames.push(filename);
42+
};
43+
44+
Scraper.prototype.getOccupiedFileNames = function getOccupiedFileNames () {
45+
return this.occupiedFileNames;
4146
};
4247

4348
Scraper.prototype.addLoadedResource = function addLoadedResource (resource) {
4449
this.loadedResources.push(resource);
4550
};
4651

52+
Scraper.prototype.addRespondedResourcePromise = function addRespondedResourcePromise (url, promise) {
53+
url = normalizeUrl(url);
54+
this.respondedResourcePromises[url] = promise;
55+
};
56+
57+
Scraper.prototype.getRespondedResourcePromise = function getRespondedResourcePromise (url) {
58+
url = normalizeUrl(url);
59+
return this.respondedResourcePromises[url];
60+
};
61+
62+
Scraper.prototype.addLoadedResourcePromise = function addLoadedResourcePromise (url, promise) {
63+
url = normalizeUrl(url);
64+
this.loadedResourcePromises[url] = promise;
65+
};
66+
67+
Scraper.prototype.getLoadedResourcePromise = function getLoadedResourcePromise (url) {
68+
url = normalizeUrl(url);
69+
return this.loadedResourcePromises[url];
70+
};
71+
4772
Scraper.prototype.getHtmlSources = function getHtmlSources () {
4873
return this.options.sources;
4974
};
@@ -53,46 +78,81 @@ Scraper.prototype.getResourceHandler = function getHandler (resource) {
5378
var depth = resource.getDepth();
5479
var depthGreaterThanMax = self.options.maxDepth && depth >= self.options.maxDepth;
5580

56-
switch (true) {
57-
case depthGreaterThanMax: return _.noop;
58-
case resource.isCss(): return loadCss;
59-
case resource.isHtml(): return loadHtmlAndCss;
60-
default: return _.noop;
81+
switch(true) {
82+
case depthGreaterThanMax:
83+
return _.noop;
84+
case resource.isCss():
85+
return loadCss;
86+
case resource.isHtml():
87+
return loadHtmlAndCss;
88+
default:
89+
return _.noop;
6190
}
6291
};
6392

6493
Scraper.prototype.loadResource = function loadResource (resource) {
6594
var self = this;
95+
var url = resource.getUrl();
6696

67-
if(!self.options.urlFilter(resource.url)){
97+
if(!self.options.urlFilter(url)) {
6898
return Promise.resolve(null);
6999
}
70100

71-
// try to find already loaded
72-
var loaded = self.getLoadedResource(resource);
101+
var loadedResourcePromise = self.getLoadedResourcePromise(url);
102+
if(loadedResourcePromise) {
103+
return loadedResourcePromise;
104+
}
73105

74-
if (!loaded) {
75-
var url = resource.getUrl();
76-
var filename = self.options.filenameGenerator(resource, self.options, self.loadedResources);
77-
resource.setFilename(filename);
106+
var respondedResourcePromise = self.requestResource(resource);
107+
108+
loadedResourcePromise = respondedResourcePromise.then(function handleResponse (resource) {
109+
return Promise.resolve()
110+
.then(function handleResource () {
111+
var resourceHandler = self.getResourceHandler(resource);
112+
return resourceHandler(self, resource);
113+
}).then(function fileHandled () {
114+
var filename = path.join(self.options.directory, resource.getFilename());
115+
var text = resource.getText();
116+
return outputFileAsync(filename, text, { encoding: 'binary' });
117+
}).then(function fileSaved () {
118+
self.addLoadedResourcePromise(resource.getUrl(), loadedResourcePromise);
119+
self.addLoadedResource(resource);
120+
return resource;
121+
});
122+
});
123+
self.addLoadedResourcePromise(url, loadedResourcePromise);
124+
125+
return loadedResourcePromise;
126+
};
127+
128+
Scraper.prototype.requestResource = function requestResource (resource) {
129+
var self = this;
130+
var url = resource.getUrl();
131+
132+
if(!self.options.urlFilter(url)) {
133+
return Promise.resolve(null);
134+
}
78135

79-
self.addLoadedResource(resource);
80-
81-
// Request -> processing -> save to fs
82-
return self.makeRequest(url).then(function requestCompleted (data) {
83-
resource.setUrl(data.url); // Url may be changed in redirects
84-
resource.setText(data.body);
85-
var handleFile = self.getResourceHandler(resource);
86-
return handleFile(self, resource);
87-
}).then(function fileHandled () {
88-
var filename = path.join(self.options.directory, resource.getFilename());
89-
var text = resource.getText();
90-
return outputFileAsync(filename, text, { encoding: 'binary' });
91-
}).then(function fileSaved () {
92-
return Promise.resolve(resource);
93-
});
136+
var respondedResourcePromise = self.getRespondedResourcePromise(url);
137+
if(respondedResourcePromise) {
138+
return respondedResourcePromise;
94139
}
95-
return Promise.resolve(loaded);
140+
141+
var promise = self.makeRequest(url).then(function requestCompleted (data) {
142+
// Url may be changed in redirects
143+
resource.setUrl(data.url);
144+
145+
var filename = self.options.filenameGenerator(resource, self.options, self.getOccupiedFileNames());
146+
resource.setFilename(filename);
147+
self.addOccupiedFileName(filename);
148+
self.addRespondedResourcePromise(data.url, promise);
149+
150+
resource.setText(data.body);
151+
return resource;
152+
});
153+
self.addRespondedResourcePromise(url, promise);
154+
155+
return promise;
96156
};
97157

98158
Scraper.prototype.validate = function validate () {
@@ -118,7 +178,7 @@ Scraper.prototype.prepare = function prepare () {
118178
return new Resource(url, filename);
119179
});
120180

121-
if (self.options.recursive) {
181+
if(self.options.recursive) {
122182
self.options.sources = _.union(self.options.sources, recursiveSources);
123183
}
124184

@@ -127,9 +187,28 @@ Scraper.prototype.prepare = function prepare () {
127187

128188
Scraper.prototype.load = function load () {
129189
var self = this;
130-
return Promise.map(self.originalResources, function loadResource (resource) {
131-
return self.loadResource(resource);
190+
_.forEach(self.originalResources, function loadResource (resource) {
191+
self.loadResource(resource);
132192
});
193+
return self.waitForLoad(0);
194+
};
195+
196+
// Returns a promise which gets resolved when all resources are loaded.
197+
// Determines whether loading is done by:
198+
// 1. Waiting until all loadedResourcePromises are resolved.
199+
// 2. Recursing itself if any new loadedResourcePromises (promises for the loading of childResources) where added during this time. If not, loading is done.
200+
Scraper.prototype.waitForLoad = function waitForLoad (previousCount) {
201+
var self = this;
202+
var count = _.size(self.loadedResourcePromises);
203+
if(count === previousCount) {
204+
return Promise.all(_.map(self.originalResources, function getLoadedResource (resource) {
205+
return self.getLoadedResourcePromise(resource.getUrl());
206+
}));
207+
} else{
208+
return Promise
209+
.all(_.toArray(self.loadedResourcePromises))
210+
.then(self.waitForLoad.bind(self, count));
211+
}
133212
};
134213

135214
Scraper.prototype.errorCleanup = function errorCleanup (error) {

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,10 @@
3434
"dependencies": {
3535
"bluebird": "^3.0.1",
3636
"cheerio": "0.20.0",
37-
"compare-urls": "^1.0.0",
3837
"css-url-parser": "^1.0.0",
3938
"fs-extra": "^0.30.0",
4039
"lodash": "^4.11.1",
40+
"normalize-url": "^1.5.3",
4141
"request": "^2.42.0",
4242
"srcset": "^1.0.0"
4343
},

0 commit comments

Comments
 (0)