Skip to content

Commit 5a58f48

Browse files
authored
Fix encoding issue for non-English websites, closes #454 #466 (#482)
1 parent d80e9b0 commit 5a58f48

File tree

19 files changed, with 183 additions and 39 deletions.

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ package-lock.json
55
npm-debug.log
66
coverage
77
test/e2e/results
8+
.nyc-output

lib/config/defaults.js

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,7 @@ const config = {
4848
],
4949
request: {
5050
throwHttpErrors: false,
51-
encoding: 'binary',
52-
//cookieJar: true,
51+
responseType: 'buffer',
5352
decompress: true,
5453
headers: {
5554
'user-agent': defaultRequestUserAgent

lib/plugins/save-resource-to-fs-plugin.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ class SaveResourceToFileSystemPlugin {
2020
registerAction('saveResource', async ({resource}) => {
2121
const filename = path.join(absoluteDirectoryPath, resource.getFilename());
2222
const text = resource.getText();
23-
await fs.outputFile(filename, text, { encoding: 'binary' });
23+
const encoding = typeof text === 'string' ? 'utf-8' : 'binary';
24+
await fs.outputFile(filename, text, { encoding });
2425
loadedResources.push(resource);
2526
});
2627

lib/request.js

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,24 @@
11
import got from 'got';
22
import logger from './logger.js';
3-
import { extend, isPlainObject } from './utils/index.js';
3+
import types from './config/resource-types.js';
4+
import { extend, isPlainObject, getTypeByMime } from './utils/index.js';
5+
6+
const TEXT_RESOURCE_TYPES = [types.html, types.css];
47

58
function getMimeType (contentType) {
69
return contentType ? contentType.split(';')[0] : null;
710
}
811

9-
function defaultResponseHandler ({response}) {
10-
return Promise.resolve(response.body);
12+
function defaultResponseHandler ({response, type}) {
13+
if (TEXT_RESOURCE_TYPES.includes(type)) {
14+
return response.body.toString();
15+
}
16+
return response.body;
1117
}
1218

1319
function transformResult (result) {
1420
switch (true) {
15-
case typeof result === 'string':
21+
case typeof result === 'string' || Buffer.isBuffer(result):
1622
return {
1723
body: result,
1824
metadata: null
@@ -41,14 +47,19 @@ async function getRequest ({url, referer, options = {}, afterResponse = defaultR
4147

4248
const response = await got(requestOptions);
4349
logger.debug(`[request] received response for ${response.url}, statusCode ${response.statusCode}`);
44-
const responseHandlerResult = transformResult(await afterResponse({response}));
50+
51+
const mimeType = getMimeType(response.headers['content-type']);
52+
const resourceType = getTypeByMime(mimeType);
53+
54+
const responseHandlerResult = transformResult(await afterResponse({ response, type: resourceType }));
4555

4656
if (!responseHandlerResult) {
4757
return null;
4858
}
4959
return {
5060
url: response.url,
51-
mimeType: getMimeType(response.headers['content-type']),
61+
type: resourceType,
62+
mimeType,
5263
body: responseHandlerResult.body,
5364
metadata: responseHandlerResult.metadata
5465
};

lib/scraper.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ import {
1313
} from './plugins/index.js';
1414

1515
import * as utils from './utils/index.js';
16-
const { extend, union, urlsEqual, getTypeByMime, getTypeByFilename, series } = utils;
16+
const { extend, union, urlsEqual, getTypeByFilename, series } = utils;
1717
import NormalizedUrlMap from './utils/normalized-url-map.js';
1818

1919
const actionNames = [
@@ -170,7 +170,7 @@ class Scraper {
170170
self.requestedResourcePromises.set(responseData.url, requestPromise);
171171
}
172172

173-
resource.setType(getTypeByMime(responseData.mimeType));
173+
resource.setType(responseData.type);
174174

175175
const { filename } = await self.runActions('generateFilename', { resource, responseData });
176176
resource.setFilename(filename);

test/functional/base/base.test.js

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,15 +51,15 @@ describe('Functional: base', function() {
5151
nock('http://blog.example.com/').get('/').replyWithFile(200, mockDirname + '/blog.html', {'content-type': 'text/html'});
5252

5353
// mock sources for index.html
54-
nock('http://example.com/').get('/index.css').replyWithFile(200, mockDirname + '/index.css');
54+
nock('http://example.com/').get('/index.css').replyWithFile(200, mockDirname + '/index.css', {'content-type': 'text/css'});
5555
nock('http://example.com/').get('/background.png').reply(200, 'OK');
5656
nock('http://example.com/').get('/cat.jpg').reply(200, 'OK');
5757
nock('http://example.com/').get('/script.min.js').reply(200, 'OK');
5858

5959
// mock sources for index.css
60-
nock('http://example.com/').get('/files/index-import-1.css').reply(200, 'OK');
61-
nock('http://example.com/').get('/files/index-import-2.css').replyWithFile(200, mockDirname + '/index-import-2.css');
62-
nock('http://example.com/').get('/files/index-import-3.css').reply(200, 'OK');
60+
nock('http://example.com/').get('/files/index-import-1.css').reply(200, 'OK', {'content-type': 'text/css'});
61+
nock('http://example.com/').get('/files/index-import-2.css').replyWithFile(200, mockDirname + '/index-import-2.css', {'content-type': 'text/css'});
62+
nock('http://example.com/').get('/files/index-import-3.css').reply(200, 'OK', {'content-type': 'text/css'});
6363
nock('http://example.com/').get('/files/index-image-1.png').reply(200, 'OK');
6464
nock('http://example.com/').get('/files/index-image-2.png').reply(200, 'OK');
6565

test/functional/binary-resources/ (new test file; its file name was not captured in this extract)

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import should from 'should';
2+
import '../../utils/assertions.js';
3+
import nock from 'nock';
4+
import fs from 'fs-extra';
5+
import cheerio from 'cheerio';
6+
import scrape from 'website-scraper';
7+
8+
const testDirname = './test/functional/binary-resources/.tmp';
9+
const mockDirname = './test/functional/binary-resources/mocks';
10+
11+
describe('Functional: images', () => {
12+
const options = {
13+
urls: [ 'http://example.com/' ],
14+
directory: testDirname,
15+
subdirectories: [
16+
{ directory: 'img', extensions: ['.jpg', '.png'] }
17+
],
18+
sources: [
19+
{ selector: 'img', attr: 'src' }
20+
],
21+
ignoreErrors: false
22+
};
23+
24+
beforeEach(() => {
25+
nock.cleanAll();
26+
nock.disableNetConnect();
27+
});
28+
29+
afterEach(() => {
30+
nock.cleanAll();
31+
nock.enableNetConnect();
32+
fs.removeSync(testDirname);
33+
});
34+
35+
beforeEach(() => {
36+
// mock base urls
37+
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'});
38+
39+
// mock sources for index.html
40+
nock('http://example.com/').get('/test-image.png').replyWithFile(200, mockDirname + '/test-image.png', {'content-type': 'image/png'});
41+
nock('http://example.com/').get('/test-image.jpg').replyWithFile(200, mockDirname + '/test-image.jpg', {'content-type': 'image/jpeg'});
42+
});
43+
44+
it('should load images and save content correctly', async () => {
45+
await scrape(options);
46+
47+
// should create directory and subdirectories
48+
fs.existsSync(testDirname).should.be.eql(true);
49+
fs.existsSync(testDirname + '/img').should.be.eql(true);
50+
51+
// should contain all sources found in index.html
52+
fs.existsSync(testDirname + '/img/test-image.png').should.be.eql(true);
53+
fs.existsSync(testDirname + '/img/test-image.jpg').should.be.eql(true);
54+
55+
// all sources in index.html should be replaced with local paths
56+
let $ = cheerio.load(fs.readFileSync(testDirname + '/index.html').toString());
57+
$('img.png').attr('src').should.be.eql('img/test-image.png');
58+
$('img.jpg').attr('src').should.be.eql('img/test-image.jpg');
59+
60+
// content of downloaded images should equal original images
61+
const originalPng = fs.readFileSync(mockDirname + '/test-image.png');
62+
const originalJpg = fs.readFileSync(mockDirname + '/test-image.jpg');
63+
const resultPng = fs.readFileSync(testDirname + '/img/test-image.png');
64+
const resultJpg = fs.readFileSync(testDirname + '/img/test-image.jpg');
65+
66+
should(resultPng).be.eql(originalPng);
67+
should(resultJpg).be.eql(originalJpg);
68+
});
69+
});

test/functional/binary-resources/mocks/index.html

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Index</title>
6+
</head>
7+
<body>
8+
<img class="jpg" src="/test-image.jpg" />
9+
<img class="png" src="/test-image.png" />
10+
</body>
11+
</html>
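Binary file, 15.4 KB (preview not loaded; presumably one of the mock images test-image.jpg / test-image.png)
Binary file, 11.9 KB (preview not loaded; presumably the other mock image)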

0 commit comments

Comments
 (0)