Skip to content

Commit 874e560

Browse files
authored
Update missing sources (#217)
1 parent c4e191b commit 874e560

File tree

18 files changed

+729
-226
lines changed

18 files changed

+729
-226
lines changed

README.md

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ scrape(options, (error, result) => {
6262
* [resourceSaver](#resourcesaver) - customize resources saving
6363
* [onResourceSaved](#onresourcesaved) - callback called when resource is saved
6464
* [onResourceError](#onresourceerror) - callback called when downloading of a resource fails
65+
* [updateMissingSources](#updatemissingsources) - replace urls of missing sources with absolute urls
6566

6667
You can find the default options in [lib/config/defaults.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/config/defaults.js) or get them using `scrape.defaults`.
6768

@@ -145,7 +146,7 @@ String, filename for index page. Defaults to `index.html`.
145146
Boolean, whether urls should be 'prettified', by having the `defaultFilename` removed. Defaults to `false`.
146147

147148
#### ignoreErrors
148-
Boolean, if `true` scraper will continue downloading resources after error occured, if `false` - scraper will finish process and return error. Defaults to `true`.
149+
Boolean, if `true` scraper will continue downloading resources after error occurred, if `false` - scraper will finish process and return error. Defaults to `true`.
149150

150151
#### urlFilter
151152
Function which is called for each url to check whether it should be scraped. Defaults to `null` - no url filter will be applied.
@@ -253,6 +254,30 @@ scrape({
253254
})
254255
```
255256

257+
#### updateMissingSources
258+
Boolean. If `true`, the scraper will set absolute urls for all failing `sources`; if `false`, it will leave them as is (which may cause the page to be displayed incorrectly).
259+
Can also be an array of `sources` to update (the structure is similar to [sources](#sources)).
260+
Defaults to `false`.
261+
```javascript
262+
// update all failing img srcs with absolute url
263+
scrape({
264+
urls: ['http://example.com/'],
265+
directory: '/path/to/save',
266+
sources: [{selector: 'img', attr: 'src'}],
267+
updateMissingSources: true
268+
});
269+
270+
// download nothing, just update all img srcs with absolute urls
271+
scrape({
272+
urls: ['http://example.com/'],
273+
directory: '/path/to/save',
274+
sources: [],
275+
updateMissingSources: [{selector: 'img', attr: 'src'}]
276+
});
277+
278+
```
279+
280+
256281
## callback
257282
Callback function, optional, includes following parameters:
258283
- `error`: if error - `Error` object, if success - `null`

lib/config/defaults.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@ const config = {
5656
httpResponseHandler: null,
5757
onResourceSaved: null,
5858
onResourceError: null,
59-
resourceSaver: null
59+
resourceSaver: null,
60+
updateMissingSources: false
6061
};
6162

6263
module.exports = config;

lib/resource-handler/css/index.js

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,21 @@
1-
var CssText = require('./../path-containers/css-text');
1+
'use strict';
22

3-
function CssResourceHandler (options, handleChildrenPaths) {
4-
this.options = options;
5-
this.handleChildrenPaths = handleChildrenPaths;
6-
}
3+
const CssText = require('./../path-containers/css-text');
4+
5+
class CssResourceHandler {
6+
constructor (options, methods) {
7+
this.options = options;
8+
this.downloadChildrenPaths = methods.downloadChildrenPaths;
9+
this.updateMissingSources = this.options.updateMissingSources === true || Array.isArray(this.options.updateMissingSources);
10+
}
711

8-
CssResourceHandler.prototype.handle = function handle (resource) {
9-
var pathContainer = new CssText(resource.getText());
10-
return this.handleChildrenPaths(pathContainer, resource).then(function updateText (updatedText) {
11-
resource.setText(updatedText);
12-
return resource;
13-
});
14-
};
12+
handle (resource) {
13+
const pathContainer = new CssText(resource.getText());
14+
return this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources).then(function updateText (updatedText) {
15+
resource.setText(updatedText);
16+
return resource;
17+
});
18+
}
19+
}
1520

1621
module.exports = CssResourceHandler;

lib/resource-handler/html/index.js

Lines changed: 52 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,30 @@ const logger = require('../../logger');
77
const HtmlSourceElement = require('./html-source-element');
88

99
class HtmlResourceHandler {
10-
constructor (options, handleChildrenPaths) {
10+
constructor (options, methods) {
1111
this.options = options;
12-
this.handleChildrenPaths = handleChildrenPaths;
12+
this.downloadChildrenPaths = methods.downloadChildrenPaths;
13+
this.updateChildrenPaths = methods.updateChildrenPaths;
14+
15+
this.recursiveSources = this.options.recursiveSources || [];
16+
this.downloadSources = this.options.sources;
17+
this.updateSources = [];
18+
19+
if (this.options.updateMissingSources === true) {
20+
this.updateSources = this.downloadSources;
21+
} else if (Array.isArray(this.options.updateMissingSources)) {
22+
this.updateSources = this.options.updateMissingSources;
23+
}
24+
25+
this.allSources = utils.union(this.downloadSources, this.updateSources);
1326
}
1427

1528
handle (resource) {
1629
const $ = loadTextToCheerio(resource.getText());
1730
prepareToLoad($, resource);
1831

19-
return Promise.mapSeries(this.options.sources, this.loadResourcesForRule.bind(this, $, resource))
20-
.then(function updateResource () {
32+
return Promise.mapSeries(this.allSources, this.loadResourcesForRule.bind(this, $, resource))
33+
.then(() => {
2134
resource.setText($.html());
2235
return resource;
2336
});
@@ -27,31 +40,53 @@ class HtmlResourceHandler {
2740
const self = this;
2841
const promises = $(rule.selector).map((i, element) => {
2942
const el = new HtmlSourceElement($(element), rule);
43+
const pathContainer = el.getPathContainer();
3044

31-
const isRecursive = self.options.recursiveSources && Boolean(el.findMatchedRule(self.options.recursiveSources));
32-
const isDepthGreaterThanMax = self.options.maxRecursiveDepth && parentResource.getDepth() >= self.options.maxRecursiveDepth;
33-
if (isRecursive && isDepthGreaterThanMax) {
45+
if (!pathContainer) {
46+
return Promise.resolve(null);
47+
}
48+
49+
const needToDownloadElement = this.needToDownload(el);
50+
const needToUpdateElement = this.needToUpdate(el);
51+
52+
if (this.exceedMaxRecursiveDepth(el, parentResource)) {
3453
logger.debug(`filtering out ${el} by max recursive depth`);
35-
return Promise.resolve();
54+
return self.updateChildrenPaths(pathContainer, parentResource, needToUpdateElement).then(el.setData.bind(el));
3655
}
3756

38-
const pathContainer = el.getPathContainer();
39-
if (!pathContainer) {
40-
return Promise.resolve();
57+
if (!needToDownloadElement) {
58+
return self.updateChildrenPaths(pathContainer, parentResource, needToUpdateElement).then(el.setData.bind(el));
4159
}
42-
return self.handleChildrenPaths(pathContainer, parentResource).then((updatedText) => {
43-
el.setData(updatedText);
44-
el.removeIntegrityCheck();
45-
});
60+
61+
return self.downloadChildrenPaths(pathContainer, parentResource, needToUpdateElement)
62+
.then((updatedText) => {
63+
el.setData(updatedText);
64+
el.removeIntegrityCheck();
65+
});
66+
4667
}).get();
4768

4869
return utils.waitAllFulfilled(promises);
4970
}
71+
72+
exceedMaxRecursiveDepth (el, parentResource) {
73+
const isRecursive = Boolean(el.findMatchedRule(this.recursiveSources));
74+
const isDepthGreaterThanMax = this.options.maxRecursiveDepth && parentResource.getDepth() >= this.options.maxRecursiveDepth;
75+
return isRecursive && isDepthGreaterThanMax;
76+
}
77+
78+
needToDownload (el) {
79+
return Boolean(el.findMatchedRule(this.downloadSources));
80+
}
81+
82+
needToUpdate (el) {
83+
return Boolean(el.findMatchedRule(this.updateSources));
84+
}
5085
}
5186

5287
function prepareToLoad ($, resource) {
53-
$('base').each(function handleBaseTag () {
54-
const el = $(this);
88+
$('base').each((i, element) => {
89+
const el = $(element);
5590
const href = el.attr('href');
5691
if (href) {
5792
const newUrl = utils.getUrl(resource.getUrl(), href);

lib/resource-handler/index.js

Lines changed: 94 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,82 +1,109 @@
1-
var _ = require('lodash');
2-
var Promise = require('bluebird');
3-
var logger = require('../logger');
4-
var utils = require('../utils');
1+
'use strict';
52

6-
var HtmlHandler = require('./html');
7-
var CssHandler = require('./css');
3+
const _ = require('lodash');
4+
const Promise = require('bluebird');
5+
const logger = require('../logger');
6+
const utils = require('../utils');
87

9-
var supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename'];
8+
const HtmlHandler = require('./html');
9+
const CssHandler = require('./css');
1010

11-
function ResourceHandler (options, context) {
12-
this.options = _.pick(options, supportedOptions);
13-
this.context = context;
11+
const supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename', 'updateMissingSources'];
1412

15-
this.htmlHandler = new HtmlHandler(this.options, this.handleChildrenResources.bind(this));
16-
this.cssHandler = new CssHandler(this.options, this.handleChildrenResources.bind(this));
17-
}
13+
class ResourceHandler {
14+
constructor (options, context) {
15+
this.options = _.pick(options, supportedOptions);
16+
this.context = context;
17+
18+
const methods = {
19+
downloadChildrenPaths: this.downloadChildrenResources.bind(this),
20+
updateChildrenPaths: this.updateChildrenResources.bind(this)
21+
};
1822

19-
ResourceHandler.prototype.getResourceHandler = function getResourceHandler (resource) {
20-
switch (true) {
21-
case resource.isCss():
22-
logger.debug('using css handler for ' + resource);
23-
return this.cssHandler;
24-
case resource.isHtml():
25-
logger.debug('using html handler for ' + resource);
26-
return this.htmlHandler;
27-
default:
28-
logger.debug('using no handler for ' + resource);
29-
return null;
23+
this.htmlHandler = new HtmlHandler(this.options, methods);
24+
this.cssHandler = new CssHandler(this.options, methods);
3025
}
31-
};
32-
33-
/**
34-
* Request all resources from pathContainers paths
35-
* @param pathContainer - instance of ImgSrcsetTag or CommonTag or CssText, contains original paths for resources
36-
* @param {Resource} parentResource
37-
* @returns {Promise} - resolved when all resources from pathContainer were requested
38-
* and original paths in parentResource were updated with local paths for children resources
39-
*/
40-
ResourceHandler.prototype.handleChildrenResources = function handleChildrenResources (pathContainer, parentResource) {
41-
var self = this;
42-
var childrenPaths = pathContainer.getPaths();
43-
var pathsToUpdate = [];
44-
45-
var childrenPromises = childrenPaths.map(function loadChildPath (childPath) {
46-
var childResourceUrl = utils.getUrl(parentResource.getUrl(), childPath);
47-
var childResource = parentResource.createChild(childResourceUrl);
48-
49-
return self.context.requestResource(childResource).then(function updateChildPath (respondedResource) {
50-
if (respondedResource) {
51-
parentResource.updateChild(childResource, respondedResource);
52-
53-
var relativePath = utils.getRelativePath(parentResource.getFilename(), respondedResource.getFilename());
54-
if (self.options.prettifyUrls) {
55-
relativePath = relativePath.replace(self.options.defaultFilename, '');
56-
}
57-
var hash = utils.getHashFromUrl(childPath);
5826

59-
if (hash) {
60-
relativePath = relativePath.concat(hash);
27+
getResourceHandler (resource) {
28+
switch (true) {
29+
case resource.isCss():
30+
logger.debug('using css handler for ' + resource);
31+
return this.cssHandler;
32+
case resource.isHtml():
33+
logger.debug('using html handler for ' + resource);
34+
return this.htmlHandler;
35+
default:
36+
logger.debug('using no handler for ' + resource);
37+
return null;
38+
}
39+
}
40+
41+
/**
42+
* Request all resources from pathContainers paths
43+
* @param pathContainer - instance of ImgSrcsetTag or CommonTag or CssText, contains original paths for resources
44+
* @param {Resource} parentResource
45+
* @param {boolean} updateIfFailed - if true - failed resources will be updated with absolute links
46+
* @returns {Promise} - resolved when all resources from pathContainer were requested
47+
* and original paths in parentResource were updated with local paths for children resources
48+
*/
49+
downloadChildrenResources (pathContainer, parentResource, updateIfFailed) {
50+
const self = this;
51+
const childrenPaths = pathContainer.getPaths();
52+
const pathsToUpdate = [];
53+
54+
const childrenPromises = childrenPaths.map((childPath) => {
55+
const childResourceUrl = utils.getUrl(parentResource.getUrl(), childPath);
56+
const childResource = parentResource.createChild(childResourceUrl);
57+
58+
return self.context.requestResource(childResource).then((respondedResource) => {
59+
if (respondedResource) {
60+
parentResource.updateChild(childResource, respondedResource);
61+
62+
let relativePath = utils.getRelativePath(parentResource.getFilename(), respondedResource.getFilename());
63+
if (self.options.prettifyUrls) {
64+
relativePath = relativePath.replace(self.options.defaultFilename, '');
65+
}
66+
const hash = utils.getHashFromUrl(childPath);
67+
68+
if (hash) {
69+
relativePath = relativePath.concat(hash);
70+
}
71+
72+
pathsToUpdate.push({ oldPath: childPath, newPath: relativePath});
73+
} else {
74+
if (updateIfFailed) {
75+
pathsToUpdate.push({ oldPath: childPath, newPath: childResourceUrl});
76+
}
6177
}
78+
return null; // Prevent Bluebird warnings
79+
});
80+
});
6281

63-
pathsToUpdate.push({ oldPath: childPath, newPath: relativePath});
64-
}
65-
return null; // Prevent Bluebird warnings
82+
return utils.waitAllFulfilled(childrenPromises).then(function updateChildrenPaths () {
83+
return pathContainer.updateText(pathsToUpdate);
6684
});
67-
});
85+
}
6886

69-
return utils.waitAllFulfilled(childrenPromises).then(function updateChildrenPaths () {
70-
return pathContainer.updateText(pathsToUpdate);
71-
});
72-
};
87+
updateChildrenResources (pathContainer, parentResource, needToUpdate) {
88+
if (!needToUpdate) {
89+
return Promise.resolve(pathContainer.updateText([]));
90+
}
91+
const parentUrl = parentResource.getUrl();
92+
const pathsToUpdate = [];
93+
pathContainer.getPaths().forEach((path) => {
94+
const childAbsoluteUrl = utils.getUrl(parentUrl, path);
95+
pathsToUpdate.push({ oldPath: path, newPath: childAbsoluteUrl });
96+
});
97+
return Promise.resolve(pathContainer.updateText(pathsToUpdate));
98+
}
7399

74-
ResourceHandler.prototype.handleResource = function handleResource (resource) {
75-
var resourceHandler = this.getResourceHandler(resource);
76-
if (resourceHandler && resourceHandler.handle) {
77-
return resourceHandler.handle(resource);
100+
handleResource (resource) {
101+
const resourceHandler = this.getResourceHandler(resource);
102+
if (resourceHandler && resourceHandler.handle) {
103+
return resourceHandler.handle(resource);
104+
}
105+
return Promise.resolve(resource);
78106
}
79-
return Promise.resolve(resource);
80-
};
107+
}
81108

82109
module.exports = ResourceHandler;

0 commit comments

Comments
 (0)