Skip to content

Commit f9d554e

Browse files
authored
Merge pull request #417 from tomas/handle-unknown-encodings
Don't explode on unknown encodings. Fixes #410
2 parents 3f8a7e3 + f552069 commit f9d554e

File tree

4 files changed

+859
-38
lines changed

4 files changed

+859
-38
lines changed

lib/decoder.js

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,16 @@ StreamDecoder.prototype._transform = function(chunk, encoding, done) {
2121
this.parsed_chunk = true;
2222

2323
var matches = regex.exec(chunk.toString());
24+
2425
if (matches) {
2526
var found = matches[1].toLowerCase().replace('utf8', 'utf-8'); // canonicalize;
2627
// set charset, but only if iconv can handle it
2728
if (iconv.encodingExists(found)) this.charset = found;
2829
}
2930
}
3031

31-
if (this.charset == 'utf-8') { // no need to decode, just pass through
32+
// if charset is already utf-8 or given encoding isn't supported, just pass through
33+
if (this.charset == 'utf-8' || !iconv.encodingExists(this.charset)) {
3234
this.push(chunk);
3335
return done();
3436
}

test/decoder_spec.js

Lines changed: 59 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -11,36 +11,31 @@ describe('character encoding', function() {
1111

1212
this.timeout(5000);
1313

14-
describe('Given content-type: "text/html; charset=EUC-JP"', function() {
15-
16-
var port = 2233;
17-
var server;
18-
19-
function createServer() {
20-
return http.createServer(function(req, res) {
21-
22-
req.on('data', function(chunk) {})
23-
24-
req.on('end', function() {
25-
// We used to pull from a particular site that is no longer up.
26-
// This is a local mirror pulled from archive.org
27-
// https://web.archive.org/web/20181003202907/http://www.nina.jp/server/slackware/webapp/tomcat_charset.html
28-
fs.readFile('test/tomcat_charset.html', function(err, data) {
29-
if (err) {
30-
res.writeHead(404);
31-
res.end(JSON.stringify(err));
32-
return;
33-
}
34-
res.writeHeader(200, { 'Content-Type': 'text/html; charset=EUC-JP' })
35-
res.end(data);
36-
});
37-
})
38-
14+
function staticServerFor(file, content_type) {
15+
return http.createServer(function(req, res) {
16+
req.on('data', function(chunk) {})
17+
req.on('end', function() {
18+
// We used to pull from a particular site that is no longer up.
19+
// This is a local mirror pulled from archive.org
20+
// https://web.archive.org/web/20181003202907/http://www.nina.jp/server/slackware/webapp/tomcat_charset.html
21+
fs.readFile(file, function(err, data) {
22+
if (err) {
23+
res.writeHead(404);
24+
res.end(JSON.stringify(err));
25+
return;
26+
}
27+
res.writeHeader(200, { 'Content-Type': content_type })
28+
res.end(data);
29+
});
3930
})
40-
}
31+
})
32+
}
33+
34+
describe('Given content-type: "text/html; charset=EUC-JP"', function() {
35+
var server, port = 2233;
4136

4237
before(function(done) {
43-
server = createServer();
38+
server = staticServerFor('test/files/tomcat_charset.html', 'text/html; charset=EUC-JP')
4439
server.listen(port, done)
4540
url = 'http://localhost:' + port;
4641
})
@@ -50,35 +45,26 @@ describe('character encoding', function() {
5045
})
5146

5247
describe('with decode = false', function() {
53-
5448
it('does not decode', function(done) {
55-
5649
needle.get(url, { decode: false }, function(err, resp) {
5750
resp.body.should.be.a.String;
5851
chardet.detect(resp.body).encoding.should.eql('windows-1252');
5952
resp.body.indexOf('EUCを使う').should.eql(-1);
6053
done();
6154
})
62-
6355
})
64-
6556
})
6657

6758
describe('with decode = true', function() {
68-
6959
it('decodes', function(done) {
70-
7160
needle.get(url, { decode: true }, function(err, resp) {
7261
resp.body.should.be.a.String;
7362
chardet.detect(resp.body).encoding.should.eql('ascii');
7463
resp.body.indexOf('EUCを使う').should.not.eql(-1);
7564
done();
7665
})
77-
7866
})
79-
8067
})
81-
8268
})
8369

8470
describe('Given content-type: "text/html but file is charset: gb2312', function() {
@@ -119,6 +105,43 @@ describe('character encoding', function() {
119105
})
120106
})
121107

108+
describe('Given content-type: text/html; charset=maccentraleurope', function() {
109+
var server, port = 2233;
110+
111+
// from 'https://wayback.archive-it.org/3259/20160921140616/https://www.arc.gov/research/MapsofAppalachia.asp?MAP_ID=11';
112+
before(function(done) {
113+
server = staticServerFor('test/files/Appalachia.html', 'text/html; charset=maccentraleurope')
114+
server.listen(port, done)
115+
url = 'http://localhost:' + port;
116+
})
117+
118+
after(function(done) {
119+
server.close(done)
120+
})
121+
122+
describe('with decode = false', function() {
123+
it('does not decode', function(done) {
124+
needle.get(url, { decode: false }, function(err, resp) {
125+
resp.body.should.be.a.String;
126+
chardet.detect(resp.body).encoding.should.eql('ascii');
127+
done();
128+
})
129+
})
130+
})
131+
132+
describe('with decode = true', function() {
133+
it('does not explode', function(done) {
134+
(function() {
135+
needle.get(url, { decode: true }, function(err, resp) {
136+
resp.body.should.be.a.String;
137+
chardet.detect(resp.body).encoding.should.eql('ascii');
138+
done();
139+
})
140+
}).should.not.throw();
141+
})
142+
})
143+
})
144+
122145
describe('Given content-type: "text/html"', function () {
123146

124147
var server,
@@ -150,7 +173,6 @@ describe('character encoding', function() {
150173
})
151174

152175
})
153-
154176
})
155177

156178
describe('multibyte characters split across chunks', function () {

0 commit comments

Comments
 (0)