Skip to content

Commit b2d2bed

Browse files
committed
Improve recognizing resource type (using parent resource & html data) + test
1 parent fea53a8 commit b2d2bed

File tree

20 files changed

+264
-39
lines changed

20 files changed

+264
-39
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
var types = require('./resource-types');
2+
3+
/**
 * Lookup table: resource type -> list of HTML tag/attribute pairs whose
 * attribute value points at a resource of that type.
 *
 * Each rule has the shape { tagName: string, attributeName: string }.
 * Keys are added in the order css, html.
 */
var cssRules = [
  { tagName: 'link', attributeName: 'href' }
];

var htmlRules = [
  { tagName: 'a', attributeName: 'href' },
  { tagName: 'iframe', attributeName: 'src' }
];

var typesByHtmlTag = {};
typesByHtmlTag[types.css] = cssRules;
typesByHtmlTag[types.html] = htmlRules;

module.exports = typesByHtmlTag;

lib/config/resource-types.js

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
var types = {
2+
css: 'css',
3+
html: 'html',
4+
other: 'other'
5+
};
6+
7+
module.exports = types;

lib/file-handlers/css.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ function loadCss (context, resource) {
1313
var promises = _.map(cssUrls, function loadResourceFromCssUrl (cssUrl) {
1414
var resourceUrl = utils.getUrl(url, cssUrl);
1515
var cssResource = new Resource(resourceUrl);
16+
cssResource.setParent(resource);
1617

1718
return context.loadResource(cssResource).then(function handleLoadedSource (loadedResource) {
1819
var relativePath = utils.getRelativePath(filename, loadedResource.getFilename());

lib/file-handlers/html.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
var Promise = require('bluebird');
22
var cheerio = require('cheerio');
3-
var _ = require('underscore');
43
var utils = require('../utils');
54
var Resource = require('../resource');
65

@@ -10,7 +9,7 @@ function loadHtml (context, resource) {
109

1110
var p = beforeHandle(resource);
1211

13-
_.each(sources, function (src) {
12+
sources.forEach(function (src) {
1413
p = p.then(function loadSource () {
1514
return handleResources(src);
1615
});
@@ -52,6 +51,8 @@ function loadResources (context, resource, source) {
5251
if (attr) {
5352
var resourceUrl = utils.getUrl(url, attr);
5453
var htmlResource = new Resource(resourceUrl);
54+
htmlResource.setParent(resource);
55+
htmlResource.setHtmlData({ tagName: el[0].name, attributeName: source.attr });
5556

5657
return context.loadResource(htmlResource).then(function handleLoadedSource (loadedResource) {
5758
var relativePath = utils.getRelativePath(filename, loadedResource.getFilename());

lib/resource.js

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,14 @@
1+
var _ = require('underscore');
12
var path = require('path');
3+
var types = require('./config/resource-types');
4+
var typesByHtmlData = require('./config/resource-types-by-tag');
5+
6+
/**
 * Determine a resource type from the HTML element that referenced it.
 *
 * Looks through the tag/attribute rules registered for each type and
 * returns the first type having a rule that matches the given data.
 *
 * @param {Object} htmlData - html element data
 * @param {string} htmlData.tagName - name of the referencing tag
 * @param {string} htmlData.attributeName - attribute holding the resource's url
 * @returns {string} the matching type, or types.other when no rule matches
 */
function getTypeByHtmlData (htmlData) {
  var matchedType = _.findKey(typesByHtmlData, function hasMatchingRule (rules) {
    return _.findWhere(rules, htmlData);
  });

  if (matchedType) {
    return matchedType;
  }
  return types.other;
}
212

313
function Resource (url, filename) {
414
this.url = url;
@@ -29,15 +39,35 @@ Resource.prototype.setText = function setText (text) {
2939
this.text = text;
3040
};
3141

42+
Resource.prototype.setParent = function setParent (parent) {
43+
this.parent = parent;
44+
};
45+
46+
/**
47+
*
48+
* @param {Object} data - html element data
49+
* @param {string} data.tagName - tag name which contains resource
50+
* @param {string} data.attributeName - attribute name with value of resource's url
51+
*/
52+
Resource.prototype.setHtmlData = function setHtmlData (data) {
53+
this.htmlData = data;
54+
};
55+
3256
Resource.prototype.getType = function getType () {
3357
var ext = path.extname(this.filename);
34-
switch (ext) {
35-
case '.css':
36-
return 'css';
37-
case '.html':
38-
return 'html';
58+
var parentType = this.parent && this.parent.getType();
59+
var hasHtmlData = !!this.htmlData;
60+
61+
switch (true) {
62+
case ext == '.html':
63+
return types.html;
64+
case ext == '.css':
65+
case !ext && parentType == types.css:
66+
return types.css;
67+
case !ext && parentType == types.html && hasHtmlData:
68+
return getTypeByHtmlData(this.htmlData);
3969
default:
40-
return 'other';
70+
return types.other;
4171
}
4272
};
4373

lib/scraper.js

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ var fs = Promise.promisifyAll(require('fs-extra'));
33
var path = require('path');
44
var _ = require('underscore');
55

6-
var defaults = require('./config/defaults.js');
6+
var defaults = require('./config/defaults');
7+
var types = require('./config/resource-types');
78
var utils = require('./utils.js');
89
var request = require('./request');
910
var Resource = require('./resource');
@@ -37,8 +38,8 @@ function generateFilename (resource) {
3738
var occupiedFilenames = self.getOccupiedFilenames();
3839

3940
var preferedFilename = resource.getFilename(); // which was set in options
40-
var urlFilename = path.basename(resource.getUrl()); // try to get filename from url
41-
var filename = utils.trimFilename(preferedFilename || urlFilename);
41+
var urlFilename = utils.getFilenameFromUrl(resource.getUrl()); // try to get filename from url
42+
var filename = preferedFilename || urlFilename || self.options.defaultFilename;
4243

4344
var ext = path.extname(filename);
4445
var dir = self.getDirectoryByExtension(ext);
@@ -64,8 +65,8 @@ function getDirectoryByExtension (ext) {
6465
function getHandleFunction (resource) {
6566
var type = resource.getType();
6667
switch (type) {
67-
case 'css': return loadCss;
68-
case 'html': return function loadHtmlAndCss (context, po) {
68+
case types.css: return loadCss;
69+
case types.html: return function loadHtmlAndCss (context, po) {
6970
return loadHtml(context, po).then(function (loaded) {
7071
return loadCss(context, loaded);
7172
});

lib/utils.js

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,6 @@
11
var url = require('url');
22
var path = require('path');
33

4-
function trimFilename(filename) {
5-
var queryRegexp = /[\?#](.*)$/;
6-
return filename.replace(queryRegexp, '');
7-
}
8-
94
function isUrl(path) {
105
var urlRegexp = /^((http[s]?:)?\/\/)/;
116
return urlRegexp.test(path);
@@ -30,8 +25,14 @@ function getRelativePath(path1, path2) {
3025
return getUnixPath(relativePath);
3126
}
3227

33-
module.exports.isUrl = isUrl;
34-
module.exports.getUrl = getUrl;
35-
module.exports.getUnixPath = getUnixPath;
36-
module.exports.trimFilename = trimFilename;
37-
module.exports.getRelativePath = getRelativePath;
28+
/**
 * Extract the last path segment (the filename) from a url, ignoring any
 * query string and hash.
 *
 * @param {string} u - url to take the filename from
 * @returns {string} basename of the url's pathname, or an empty string when
 *   the url has no pathname (e.g. '#anchor') or the pathname ends with '/'
 */
function getFilenameFromUrl (u) {
  var pathname = url.parse(u).pathname;
  // url.parse() yields a null pathname for urls without a path component;
  // path.basename() throws on non-string input, so return '' instead.
  return pathname ? path.basename(pathname) : '';
}
31+
32+
module.exports = {
33+
isUrl: isUrl,
34+
getUrl: getUrl,
35+
getUnixPath: getUnixPath,
36+
getRelativePath: getRelativePath,
37+
getFilenameFromUrl: getFilenameFromUrl
38+
};

test/functional/complex-test.js renamed to test/functional/base-test.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ var _ = require('underscore');
66
var cheerio = require('cheerio');
77
var scraper = require('../../index');
88

9-
var testDirname = __dirname + '/.complex';
10-
var mockDirname = __dirname + '/mocks/complex';
9+
var testDirname = __dirname + '/.base';
10+
var mockDirname = __dirname + '/mocks/base';
1111

12-
describe('Functional', function() {
12+
describe('Functional base', function() {
1313

1414
beforeEach(function() {
1515
nock.cleanAll();
File renamed without changes.
File renamed without changes.

0 commit comments

Comments
 (0)