Skip to content

Commit 8e85c73

Browse files
committed
Fix loading from <img srcset="">, solve #34
1 parent c9de606 commit 8e85c73

File tree

5 files changed

+142
-30
lines changed

5 files changed

+142
-30
lines changed

lib/config/defaults.js

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ var config = {
55
selector: 'img',
66
attr: 'src'
77
},
8+
{
9+
selector: 'img',
10+
attr: 'srcset'
11+
},
812
{
913
selector: 'input',
1014
attr: 'src'

lib/file-handlers/html.js

Lines changed: 85 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
var cheerio = require('cheerio');
22
var Promise = require('bluebird');
3+
var srcset = require('srcset');
34
var utils = require('../utils');
45

56
function loadHtml (context, resource) {
6-
var sources = context.getHtmlSources();
7-
var handleResources = loadResources.bind(null, context, resource);
8-
7+
var rules = context.getHtmlSources();
98
var p = beforeHandle(resource);
109

11-
sources.forEach(function (src) {
12-
p = p.then(function loadSource () {
13-
return handleResources(src);
10+
rules.forEach(function (rule) {
11+
p = p.then(function loadResources () {
12+
return loadResourcesForRule(context, resource, rule);
1413
});
1514
});
1615
return p;
@@ -37,31 +36,91 @@ function beforeHandle (resource) {
3736
return Promise.resolve(resource);
3837
}
3938

40-
function loadResources (context, resource, source) {
41-
var url = resource.getUrl();
42-
var text = resource.getText();
43-
var filename = resource.getFilename();
44-
var $ = cheerio.load(text);
39+
/**
 * Picks the loader function responsible for the element described by htmlData.
 * @param {HtmlData} htmlData - tag name / attribute name of the element holding the resource
 * @returns {Function} loadImgSrcsetResource for <img srcset="...">, loadGeneralResource for everything else
 */
function getResourceLoaderByHtmlData (htmlData) {
  var isImgSrcset = htmlData.tagName === 'img' && htmlData.attributeName === 'srcset';
  return isImgSrcset ? loadImgSrcsetResource : loadGeneralResource;
}
4549

46-
var promises = $(source.selector).map(function loadForSelector () {
47-
var el = $(this);
48-
var attr = el.attr(source.attr);
50+
/**
 * Describes the html element/attribute pair in which a resource was found.
 * @param {Object} el - cheerio element; its first matched node (el[0]) supplies the tag name
 * @param {string} attrName - name of the attribute that holds the resource url(s)
 * @returns {HtmlData} object with tagName, attributeName and the attribute's current value
 */
function createHtmlData (el, attrName) {
  return {
    tagName: el[0].name,
    attributeName: attrName,
    attributeValue: el.attr(attrName)
  };
}
62+
63+
/**
 * Download every image listed in an <img srcset="..."> attribute.
 *
 * Candidates are loaded one after another (Promise.mapSeries). Once all of them
 * are loaded, the srcset string is rebuilt with each url replaced by the path of
 * the loaded file relative to the parent resource - the same rewriting that
 * loadGeneralResource applies to single-url attributes.
 *
 * @param context - object providing loadResource(resource)
 * @param {Resource} parentResource - html resource that contains the <img> element
 * @param {HtmlData} childResourceHtmlData - element data; attributeValue is the raw srcset string
 * @returns {Promise} resolved with the updated srcset attribute value
 */
function loadImgSrcsetResource (context, parentResource, childResourceHtmlData) {
  var imgSrcsetParts = srcset.parse(childResourceHtmlData.attributeValue);

  return Promise.mapSeries(imgSrcsetParts, function loadImgSrcsetPart (imgSrcsetPart) {
    var childResourceUrl = utils.getUrl(parentResource.getUrl(), imgSrcsetPart.url);
    var childResource = parentResource.createChild(childResourceUrl);
    childResource.setHtmlData(childResourceHtmlData);

    return context.loadResource(childResource).then(function updateSrcsetPart (loadedResource) {
      // Point the candidate at the saved file, relative to the html file that references it
      imgSrcsetPart.url = utils.getRelativePath(parentResource.getFilename(), loadedResource.getFilename());
    });
  }).then(function updateSrcset () {
    // A plain return is enough: then() already wraps the value in a promise
    return srcset.stringify(imgSrcsetParts);
  });
}
85+
86+
/**
87+
* Download common resource
88+
* @param context
89+
* @param {Resource} parentResource
90+
* @param {HtmlData} childResourceHtmlData
91+
* @returns {Promise}
92+
*/
93+
function loadGeneralResource (context, parentResource, childResourceHtmlData) {
94+
var attr = childResourceHtmlData.attributeValue;
95+
96+
var resourceUrl = utils.getUrl(parentResource.getUrl(), attr);
97+
var htmlResource = parentResource.createChild(resourceUrl);
98+
htmlResource.setHtmlData(childResourceHtmlData);
4999

50-
if (attr) {
51-
var resourceUrl = utils.getUrl(url, attr);
52-
var htmlResource = resource.createChild(resourceUrl);
53-
htmlResource.setHtmlData({ tagName: el[0].name, attributeName: source.attr });
100+
return context.loadResource(htmlResource).then(function handleLoadedSource (loadedResource) {
101+
var relativePath = utils.getRelativePath(parentResource.getFilename(), loadedResource.getFilename());
102+
var hash = utils.getHashFromUrl(attr);
54103

55-
return context.loadResource(htmlResource).then(function handleLoadedSource (loadedResource) {
56-
var relativePath = utils.getRelativePath(filename, loadedResource.getFilename());
57-
var hash = utils.getHashFromUrl(attr);
104+
if (hash && loadedResource.isHtml()) {
105+
relativePath = relativePath.concat(hash);
106+
}
58107

59-
if (hash && loadedResource.isHtml()) {
60-
relativePath = relativePath.concat(hash);
61-
}
108+
return Promise.resolve(relativePath);
109+
});
110+
}
111+
112+
function loadResourcesForRule (context, resource, rule) {
113+
var text = resource.getText();
114+
var $ = cheerio.load(text);
115+
116+
var promises = $(rule.selector).map(function loadForElement () {
117+
var el = $(this);
118+
if (el.attr(rule.attr)) {
119+
var childResourceHtmlData = createHtmlData(el, rule.attr);
120+
var loadResourcesForElement = getResourceLoaderByHtmlData(childResourceHtmlData);
62121

63-
el.attr(source.attr, relativePath);
64-
return Promise.resolve();
122+
return loadResourcesForElement(context, resource, childResourceHtmlData).then(function changeAttr (updatedAttr) {
123+
el.attr(rule.attr, updatedAttr);
65124
});
66125
}
67126
return Promise.reject();

lib/resource.js

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,27 @@ Resource.prototype.setDepth = function setDepth (depth) {
6363
};
6464

6565
/**
66+
* Html Data for resource, represents html element where resource was found
6667
*
67-
* @param {Object} data - html element data
68-
* @param {string} data.tagName - tag name which contain resource
69-
* @param {string} data.attributeName - attribute name with value of resource's url
68+
* @typedef {Object} HtmlData
69+
* @property {string} tagName - tag of element
70+
* @property {string} attributeName - attribute in tag where resource was found
71+
* @property {string} attributeValue - attribute value, contains url of resources
72+
*
73+
* Example: for resource in <img src="/images/foo.png"> it will be
74+
* {
75+
* tagName: 'img',
76+
* attributeName: 'src',
77+
* attributeValue: '/images/foo.png'
78+
* }
79+
*/
80+
81+
/**
82+
*
83+
* @param {HtmlData} data
7084
*/
7185
Resource.prototype.setHtmlData = function setHtmlData (data) {
72-
this.htmlData = data;
86+
this.htmlData = _.pick(data, ['tagName', 'attributeName']);
7387
};
7488

7589
Resource.prototype.getType = function getType () {

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
"css-url-parser": "^0.1.0",
3838
"fs-extra": "^0.26.0",
3939
"request": "^2.42.0",
40+
"srcset": "^1.0.0",
4041
"underscore": "^1.7.0"
4142
},
4243
"devDependencies": {

test/unit/file-handlers/html-test.js

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ var defaultScraperOpts = {
1313
directory: testDirname,
1414
sources: [
1515
{ selector: 'img', attr: 'src' },
16+
{ selector: 'img', attr: 'srcset' },
1617
{ selector: 'link[rel="stylesheet"]', attr: 'href' },
1718
{ selector: 'script', attr: 'src'},
1819
{ selector: 'a', attr: 'href' }
@@ -266,5 +267,38 @@ describe('Html handler', function () {
266267
}).catch(done);
267268
});
268269

270+
it('should handle img tag with srcset attribute correctly', function () {
  var image45Stub = new Resource('http://example.com/image45.jpg', 'local/image45.jpg');
  var image150Stub = new Resource('http://example.com/image150.jpg', 'local/image150.jpg');

  // Stubbed results are returned in call order: first, second, third loadResource call
  sinon.stub(scraper, 'loadResource')
    .onFirstCall().returns(Promise.resolve(image45Stub))
    .onSecondCall().returns(Promise.resolve(image150Stub))
    .onThirdCall().returns(Promise.resolve(image45Stub));

  var html = '\
<html> \
<body> \
<img src="http://example.com/image45.jpg" \
srcset="http://example.com/image150.jpg 150w, http://example.com/image45.jpg 45w" \
sizes="(max-width: 45px) 100vw, 45px" width="45" height="45"> \
</body> \
</html>\
';

  var po = new Resource('http://example.com', 'index.html');
  po.setText(html);

  // Return the promise instead of also taking a `done` callback: mocha reports
  // "Resolution method is overspecified" when a test uses both mechanisms.
  return loadHtml(scraper, po).then(function () {
    var text = po.getText();
    text.should.not.containEql('http://example.com/image45.jpg');
    text.should.not.containEql('http://example.com/image150.jpg');
    text.should.containEql('src="local/image45.jpg"');
    text.should.containEql('srcset="local/image150.jpg 150w, local/image45.jpg 45w"');
  });
});
302+
269303
});
270304
});

0 commit comments

Comments
 (0)