Skip to content

Commit a2b5c56

Browse files
authored
Add referer and decode html entities (#171)
* Decode html entities in html resources * Add referer to request * Add unit tests * Add functional test for decoding html entities * Add missing newlines
1 parent 06cf3a2 commit a2b5c56

File tree

12 files changed

+203
-48
lines changed

12 files changed

+203
-48
lines changed

lib/request.js

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,25 @@ var _ = require('lodash');
22
var Promise = require('bluebird');
33
var request = require('request');
44
var get = Promise.promisify(request.get);
5+
var logger = require('./logger');
56

67
/**
 * Extract the bare media type from a Content-Type header value.
 *
 * Everything from the first `;` onward (parameters such as `charset`) is
 * dropped. The remaining type is trimmed, because optional whitespace is
 * allowed before the `;`, and lower-cased, because media types are
 * case-insensitive and a normalized value is easier to compare or look up.
 *
 * @param {string} [contentType] - raw Content-Type header value
 * @returns {string|null} normalized media type, or null when the header is
 *   missing or carries no type before the first `;`
 */
function getMimeType (contentType) {
	if (!contentType) {
		return null;
	}

	var mimeType = contentType.split(';')[0].trim().toLowerCase();

	// A header such as "; charset=utf-8" has no usable type; report it as absent.
	return mimeType || null;
}
910

10-
function makeRequest (options, url) {
11+
function makeRequest (options, url, referer) {
1112
var requestOptions = _.clone(options);
1213
requestOptions.url = url;
1314

15+
if (referer) {
16+
requestOptions.headers = requestOptions.headers || {};
17+
requestOptions.headers.referer = referer;
18+
}
19+
20+
logger.debug(`[request] sending request for url ${url}, referer ${referer}`);
21+
1422
return get(requestOptions).then(function handleResponse (data) {
23+
logger.debug(`[request] received response for ${data.request.href}, statusCode ${data.statusCode}`);
1524
return {
1625
url: data.request.href,
1726
mimeType: getMimeType(data.headers['content-type']),

lib/resource-handler/html/html-source-element.js

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ var ImgSrcsetTag = require('../path-containers/html-img-srcset-tag');
22
var CommonTag = require('../path-containers/html-common-tag');
33
var CssText = require('../path-containers/css-text');
44
var _ = require('lodash');
5+
var utils = require('../../utils');
56

67
var pathContainersByRule = [
78
{ selector: '[style]', attr: 'style', containerClass: CssText },
@@ -22,8 +23,13 @@ function HtmlSourceElement (el, rule) {
2223
this.rule = rule;
2324
}
2425

26+
/**
27+
* Get text from attr or from innerHtml of element based on rule
28+
* @returns {string}
29+
*/
2530
HtmlSourceElement.prototype.getData = function getData () {
26-
return this.rule.attr ? this.el.attr(this.rule.attr) : this.el.text();
31+
var text = this.rule.attr ? this.el.attr(this.rule.attr) : this.el.text();
32+
return utils.decodeHtmlEntities(text);
2733
};
2834

2935
HtmlSourceElement.prototype.setData = function setData (newData) {

lib/scraper.js

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -111,13 +111,12 @@ Scraper.prototype.requestResource = function requestResource (resource) {
111111

112112
respondedResourcePromise = Promise.resolve()
113113
.then(function makeRequest () {
114-
logger.debug('requesting ' + url);
115-
return self.makeRequest(url);
114+
var referer = resource.parent ? resource.parent.getUrl() : null;
115+
return self.makeRequest(url, referer);
116116
}).then(function requestCompleted (responseData) {
117-
logger.debug('received response for ' + url);
118117

119118
if (!utils.urlsEqual(responseData.url, url)) { // Url may be changed in redirects
120-
logger.debug('url changed. old url = ' + url + ', new ulr = ' + responseData.url);
119+
logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url);
121120
var respondedNewUrlResource = self.getRespondedResourcePromise(responseData.url);
122121
if (respondedNewUrlResource) {
123122
return respondedNewUrlResource;
@@ -138,8 +137,6 @@ Scraper.prototype.requestResource = function requestResource (resource) {
138137
}
139138

140139
resource.setText(responseData.body);
141-
142-
logger.debug('finish request for ' + resource);
143140
return resource;
144141
}).catch(function handleError (err) {
145142
logger.warn('failed to request resource ' + resource);

lib/utils.js

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ var url = require('url');
22
var path = require('path');
33
var Promise = require('bluebird');
44
var normalizeUrl = require('normalize-url');
5+
var htmlEntities = require('he');
56
var typeByMime = require('./config/resource-type-by-mime');
67
var typeByExt = require('./config/resource-type-by-ext');
78

@@ -102,6 +103,10 @@ function getTypeByFilename (filename) {
102103
return typeByExt[ext];
103104
}
104105

106+
/**
 * Decode HTML entities (e.g. `&amp;`) in the given text.
 *
 * @param {*} text - value to decode; only strings are decoded
 * @returns {string} decoded text, or an empty string when `text` is not a string
 */
function decodeHtmlEntities (text) {
	if (typeof text !== 'string') {
		return '';
	}

	return htmlEntities.decode(text);
}
109+
105110
module.exports = {
106111
isUrl: isUrl,
107112
getUrl: getUrl,
@@ -117,5 +122,6 @@ module.exports = {
117122
urlsEqual: urlsEqual,
118123
isUriSchemaSupported: isUriSchemaSupported,
119124
getTypeByMime: getTypeByMime,
120-
getTypeByFilename: getTypeByFilename
125+
getTypeByFilename: getTypeByFilename,
126+
decodeHtmlEntities: decodeHtmlEntities
121127
};

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
"css-url-parser": "^1.0.0",
3939
"debug": "^2.4.5",
4040
"fs-extra": "^1.0.0",
41+
"he": "^1.1.0",
4142
"lodash": "^4.11.1",
4243
"normalize-url": "^1.5.3",
4344
"request": "^2.42.0",
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
var should = require('should');
2+
var nock = require('nock');
3+
var fs = require('fs-extra');
4+
var Scraper = require('../../../lib/scraper');
5+
6+
var testDirname = __dirname + '/.tmp';
7+
var mockDirname = __dirname + '/mocks';
8+
9+
describe('Functional: html entities in url', function() {
10+
11+
beforeEach(function() {
12+
nock.cleanAll();
13+
nock.disableNetConnect();
14+
});
15+
16+
afterEach(function() {
17+
nock.cleanAll();
18+
nock.enableNetConnect();
19+
fs.removeSync(testDirname);
20+
});
21+
22+
it('should decode all html-entities found in html files and not encode entities from css file', function() {
23+
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
24+
nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css');
25+
26+
// in index.html
27+
// /fonts?family=Myriad&amp;v=2 => /fonts?family=Myriad&v=2
28+
nock('http://example.com/').get('/fonts?family=Myriad&v=2').reply(200, 'fonts.css', {'content-type': 'text/css'});
29+
// /style-attr.png?a=1&amp;style-attr.png => /style-attr.png?a=1&style-attr.png
30+
nock('http://example.com/').get('/style-attr.png?a=1&style-attr.png').reply(200, 'style-attr.png', {'content-type': 'text/css'});
31+
// /img.png?a=1&amp;b=2 => /img.png?a=1&b=2
32+
nock('http://example.com/').get('/img.png?a=1&b=2').reply(200, 'img.png');
33+
// ?b=2&amp;c=3&amp;d=4 => /?b=2&c=3&d=4
34+
nock('http://example.com/').get('/?b=2&c=3&d=4').reply(200, 'index_1.html', {'content-type': 'text/html'});
35+
36+
// in style.css
37+
// /external-style.png?v=2&amp;name=external-style.png should stay undecoded (entities in css are not html entities)
38+
nock('http://example.com/').get('/external-style.png?v=2&name=external-style.png').reply(200, 'external-style.png');
39+
40+
var options = {
41+
urls: [ 'http://example.com/' ],
42+
directory: testDirname,
43+
maxDepth: 2,
44+
recursive: true,
45+
subdirectories: [
46+
{ directory: 'local', extensions: ['.png', '.css'] }
47+
],
48+
ignoreErrors: false
49+
};
50+
var scraper = new Scraper(options);
51+
52+
return scraper.scrape(options).then(function() {
53+
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
54+
var indexHtml = fs.readFileSync(testDirname + '/index.html').toString();
55+
56+
should(indexHtml).containEql('href="local/fonts.css');
57+
fs.existsSync(testDirname + '/local/fonts.css').should.be.eql(true);
58+
should(fs.readFileSync(testDirname + '/local/fonts.css').toString()).be.eql('fonts.css');
59+
60+
should(indexHtml).containEql('background: url(\'local/style-attr.png\')');
61+
fs.existsSync(testDirname + '/local/style-attr.png').should.be.eql(true);
62+
should(fs.readFileSync(testDirname + '/local/style-attr.png').toString()).be.eql('style-attr.png');
63+
64+
should(indexHtml).containEql('img src="local/img.png');
65+
fs.existsSync(testDirname + '/local/img.png').should.be.eql(true);
66+
should(fs.readFileSync(testDirname + '/local/img.png').toString()).be.eql('img.png');
67+
68+
should(indexHtml).containEql('href="index_1.html"');
69+
fs.existsSync(testDirname + '/index_1.html').should.be.eql(true);
70+
should(fs.readFileSync(testDirname + '/index_1.html').toString()).be.eql('index_1.html');
71+
72+
fs.existsSync(testDirname + '/local/style.css').should.be.eql(true);
73+
var styleCss = fs.readFileSync(testDirname + '/local/style.css').toString();
74+
75+
should(styleCss).containEql('url(\'external-style.png\')');
76+
fs.existsSync(testDirname + '/local/external-style.png').should.be.eql(true);
77+
should(fs.readFileSync(testDirname + '/local/external-style.png').toString()).be.eql('external-style.png');
78+
});
79+
});
80+
});
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<link rel="stylesheet" type="text/css" href="style.css"/>
6+
<link rel="stylesheet" href="http://example.com/fonts?family=Myriad&amp;v=2" type="text/css" media="all" />
7+
</head>
8+
<body>
9+
<div style="background: url('http://example.com/style-attr.png?a=1&amp;style-attr.png')"></div>
10+
<img src="http://example.com/img.png?a=1&amp;b=2" />
11+
<a href="?b=2&amp;c=3&amp;d=4">test</a>
12+
</body>
13+
</html>
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
.bg {
2+
background-image: url('http://example.com/external-style.png?v=2&amp;name=external-style.png');
3+
}

test/unit/filename-generator/by-site-structure-test.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ var bySiteStructureFilenameGenerator = require('../../../lib/filename-generator/
77

88
var options = { defaultFilename: 'index.html' };
99

10-
describe('byStructureFilenameGenerator', function() {
10+
describe('FilenameGenerator: bySiteStructure', function() {
1111
it('should return the normalized relative path of the resource url', function(){
1212
var r1 = new Resource('http://example.com/some/path/a.png');
1313
bySiteStructureFilenameGenerator(r1, options).should.equalFileSystemPath('some/path/a.png');

test/unit/filename-generator/by-type-test.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ var sinon = require('sinon');
55
var Resource = require('../../../lib/resource');
66
var byTypeFilenameGenerator = require('../../../lib/filename-generator/by-type');
77

8-
describe('byTypeFilenameGenerator', function() {
8+
describe('FilenameGenerator: byType', function() {
99
it('should return resource filename', function() {
1010
var r = new Resource('http://example.com/a.png', 'b.png');
1111
var filename = byTypeFilenameGenerator(r, {}, []);

0 commit comments

Comments
 (0)