Skip to content

Commit 6bb4e20

Browse files
authored
Use encoding from resource text (#504)
1 parent 07c4c02 commit 6bb4e20

File tree

13 files changed

+156
-19
lines changed

13 files changed

+156
-19
lines changed

.eslintrc.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
extends: "eslint:recommended"
22
parserOptions:
3-
ecmaVersion: 8
3+
ecmaVersion: 2020
44
sourceType: "module"
55
env:
66
node: true

lib/resource-handler/css/index.js

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import CssText from './../path-containers/css-text.js';
2+
import { getCharsetFromCss, updateResourceEncoding } from '../../utils/index.js';
23

34
class CssResourceHandler {
45
constructor (options, methods) {
@@ -7,12 +8,21 @@ class CssResourceHandler {
78
this.updateMissingSources = this.options.updateMissingSources === true || Array.isArray(this.options.updateMissingSources);
89
}
910

10-
handle (resource) {
11+
async handle (resource) {
12+
prepareToLoad(resource);
13+
1114
const pathContainer = new CssText(resource.getText());
12-
return this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources).then(function updateText (updatedText) {
13-
resource.setText(updatedText);
14-
return resource;
15-
});
15+
16+
const updatedText = await this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources);
17+
resource.setText(updatedText);
18+
return resource;
19+
}
20+
}
21+
22+
/**
 * Switch the resource to UTF-8 when its CSS text declares UTF-8 through a
 * leading `@charset` rule. Resources without such a rule, or declaring any
 * other charset, are left untouched.
 *
 * @param {Object} resource - resource exposing `getText()`; it is handed to
 *   `updateResourceEncoding` when a UTF-8 declaration is found
 */
function prepareToLoad (resource) {
	// Both spellings are registered labels for UTF-8. Other charsets are not
	// converted here (open question from the original author: do we need to
	// support more charsets?).
	const UTF8_LABELS = ['utf-8', 'utf8'];

	// getCharsetFromCss yields a lower-cased label or null, so a plain
	// membership test covers the "no charset" case as well.
	const charset = getCharsetFromCss(resource.getText());
	if (UTF8_LABELS.includes(charset)) {
		updateResourceEncoding(resource, 'utf8');
	}
}
1828

lib/resource-handler/html/index.js

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import cheerio from 'cheerio';
2-
import { union, getUrl, series } from '../../utils/index.js';
2+
import { union, getUrl, series, updateResourceEncoding } from '../../utils/index.js';
33
import logger from '../../logger.js';
44
import HtmlSourceElement from './html-source-element.js';
55

@@ -23,9 +23,8 @@ class HtmlResourceHandler {
2323
}
2424

2525
async handle (resource) {
26+
prepareToLoad(resource);
2627
const $ = loadTextToCheerio(resource.getText());
27-
prepareToLoad($, resource);
28-
2928
const sourceRulesLoadPromises = this.allSources.map(
3029
rule => this.loadResourcesForRule.bind(this, $, resource, rule)
3130
);
@@ -68,16 +67,29 @@ class HtmlResourceHandler {
6867
}
6968
}
7069

71-
function prepareToLoad ($, resource) {
72-
$('base').each((i, element) => {
70+
/**
 * Normalise an HTML resource before its child sources are collected.
 *
 * Two things happen, in this order:
 *  1. every `<base href>` tag rewrites the resource url, is removed from the
 *     document, and the re-serialised document is stored back on the resource;
 *  2. every `<meta charset>` tag declaring utf-8 switches the resource
 *     encoding to utf8.
 *
 * @param {Object} resource - resource whose text, url and encoding may be updated
 */
function prepareToLoad (resource) {
	const $ = loadTextToCheerio(resource.getText());

	$('base[href]').each((i, element) => {
		const baseTag = $(element);
		const href = baseTag.attr('href');
		// The selector guarantees the attribute exists, but it may still be empty.
		if (!href) {
			return;
		}

		const newUrl = getUrl(resource.getUrl(), href);
		logger.debug(`<base> tag found in resource ${resource}, changing url to ${newUrl}`);
		resource.setUrl(newUrl);

		baseTag.remove();
		resource.setText($.html());
	});

	$('meta[charset]').each((i, element) => {
		const charset = $(element).attr('charset')?.toLowerCase();
		// utf-8 is the only valid value for html5 documents
		if (charset === 'utf-8') {
			updateResourceEncoding(resource, 'utf8');
		}
	});
}
8294

8395
function loadTextToCheerio (text) {

lib/resource.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ class Resource {
8787
}
8888

8989
toString () {
90-
return '{ url: "' + this.getUrl() + '", filename: "' + this.getFilename() + '", depth: ' + this.getDepth() + ' }';
90+
return `{ url: "${this.getUrl()}", filename: "${this.getFilename()}", depth: ${this.getDepth()}, type: "${this.getType()}" }`;
9191
}
9292

9393
isSaved () {

lib/scraper.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,6 @@ class Scraper {
170170
self.requestedResourcePromises.set(responseData.url, requestPromise);
171171
}
172172

173-
resource.setEncoding(responseData.encoding);
174173
resource.setType(getTypeByMime(responseData.mimeType));
175174

176175
const { filename } = await self.runActions('generateFilename', { resource, responseData });
@@ -185,7 +184,9 @@ class Scraper {
185184
resource.setMetadata(responseData.metadata);
186185
}
187186

187+
resource.setEncoding(responseData.encoding);
188188
resource.setText(responseData.body);
189+
189190
self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad
190191
return resource;
191192
}).catch(function handleError (err) {

lib/utils/index.js

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,32 @@ async function series (promises) {
161161
return results;
162162
}
163163

164+
/**
 * Read the charset declared by an `@charset` rule at the very start of CSS text.
 *
 * Only a rule that begins at the first character is considered; the quoted
 * value may use double or single quotes and must be followed by whitespace
 * or `;`.
 *
 * @param {string} cssText - raw stylesheet text
 * @returns {string|null} lower-cased charset label, or null when there is no
 *   leading `@charset` rule, the label is empty, or `cssText` is not a string
 */
function getCharsetFromCss (cssText) {
	// Anchored with ^ so a malformed leading rule can never be "rescued" by a
	// later @charset occurrence (for example one inside a comment).
	const CHARSET_REGEXP = /^@charset\s(?:"(.*?)"|'(.*?)')[\s;]/;

	// Resource text may be missing; treat that as "no charset declared".
	if (typeof cssText !== 'string') {
		return null;
	}

	const charsetMatch = CHARSET_REGEXP.exec(cssText);
	const charset = charsetMatch?.[1] || charsetMatch?.[2];
	return charset?.toLowerCase() ?? null;
}
176+
177+
/**
 * Re-interpret a resource's text in a new encoding and record that encoding
 * on the resource.
 *
 * The current text is converted back to bytes using the resource's current
 * encoding, and those bytes are then decoded with `encoding`.
 *
 * @param {Object} resource - object exposing getText/setText/getEncoding/setEncoding
 * @param {string} encoding - Buffer encoding name to switch to, e.g. 'utf8'
 */
function updateResourceEncoding (resource, encoding) {
	logger.debug(`updating encoding of resource ${resource} to ${encoding}`);

	const currentEncoding = resource.getEncoding();
	const resourceText = resource.getText();

	// Transcoding only makes sense when the encoding actually changes; when it
	// is already the target, skip the Buffer round trip and leave the text as is.
	if (resourceText && currentEncoding !== encoding) {
		const updatedText = Buffer.from(resourceText, currentEncoding).toString(encoding);
		resource.setText(updatedText);
	}

	resource.setEncoding(encoding);
}
189+
164190
export {
165191
isUrl,
166192
getUrl,
@@ -181,5 +207,7 @@ export {
181207
extend,
182208
union,
183209
isPlainObject,
184-
series
210+
series,
211+
getCharsetFromCss,
212+
updateResourceEncoding
185213
};

test/functional/encoding/hieroglyphs.test.js renamed to test/functional/encoding/encoding.test.js

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import scrape from 'website-scraper';
66
const testDirname = './test/functional/encoding/.tmp';
77
const mockDirname = './test/functional/encoding/mocks';
88

9-
describe('Functional: UTF8 characters are properly encoded/decoded', () => {
9+
describe('Functional: encoding', () => {
1010
const options = {
1111
urls: [
1212
'http://example.com/',
@@ -26,17 +26,29 @@ describe('Functional: UTF8 characters are properly encoded/decoded', () => {
2626
await fs.rm(testDirname, { recursive: true, force: true });
2727
});
2828

29-
beforeEach(() => {
30-
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html; charset=utf-8'});
29+
// The charset is announced only by the HTTP content-type header; the mock page has no <meta charset>.
it('should save the page with encoding from http response headers', async () => {
	nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/without-charset.html', {'content-type': 'text/html; charset=utf-8'});

	await scrape(options);

	const scrapedIndex = await fs.readFile(testDirname + '/index.html', { encoding: 'utf8' });
	scrapedIndex.should.be.containEql('<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>');
	scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Слава Україні!</div>');
	scrapedIndex.should.be.containEql('<div id="special-characters-chinese">加入网站</div>');
	scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Обладнання та ПЗ</div>');
	scrapedIndex.should.be.containEql('<div id="special-characters-french">PAR PASSION DU VÉLO</div>');
});
3241

33-
it('should save the page in the same data as it was originally', async () => {
42+
// The HTTP content-type header carries no charset here; the mock page is named with-charset.html.
it('should save the page with encoding from html meta tag', async () => {
	nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/with-charset.html', {'content-type': 'text/html'});

	await scrape(options);

	const scrapedIndex = await fs.readFile(testDirname + '/index.html', { encoding: 'utf8' });
	scrapedIndex.should.be.containEql('<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>');
	scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Слава Україні!</div>');
	scrapedIndex.should.be.containEql('<div id="special-characters-chinese">加入网站</div>');
	scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Обладнання та ПЗ</div>');
	scrapedIndex.should.be.containEql('<div id="special-characters-french">PAR PASSION DU VÉLO</div>');
});
4254
});

test/functional/encoding/mocks/index.html renamed to test/functional/encoding/mocks/with-charset.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,6 @@
99
<div id="special-characters-ukrainian">Слава Україні!</div>
1010
<div id="special-characters-chinese">加入网站</div>
1111
<div id="special-characters-ukrainian">Обладнання та ПЗ</div>
12+
<div id="special-characters-french">PAR PASSION DU VÉLO</div>
1213
</body>
1314
</html>
test/functional/encoding/mocks/without-charset.html

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<title>Test</title>
5+
</head>
6+
<body>
7+
<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>
8+
<div id="special-characters-ukrainian">Слава Україні!</div>
9+
<div id="special-characters-chinese">加入网站</div>
10+
<div id="special-characters-ukrainian">Обладнання та ПЗ</div>
11+
<div id="special-characters-french">PAR PASSION DU VÉLO</div>
12+
</body>
13+
</html>

test/unit/plugins.test.js

Whitespace-only changes.

0 commit comments

Comments
 (0)