Skip to content

Commit fabd8d2

Browse files
committed
refactor!: Update code
Signed-off-by: Richie Bendall <[email protected]>
1 parent 2a9c557 commit fabd8d2

File tree

12 files changed

+109
-682
lines changed

12 files changed

+109
-682
lines changed

README.md

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Fetch Charset Detection [![Travis CI Build Status](https://img.shields.io/travis/com/Richienb/fetch-charset-detection/master.svg?style=for-the-badge)](https://travis-ci.com/Richienb/fetch-charset-detection)
22

3-
Charset detection and conversion utilities, originally from `node-fetch`.
3+
Charset detection and conversion, originally from `node-fetch`.
44

55
[![NPM](https://nodei.co/npm/fetch-charset-detection.png?downloads=true&downloadRank=true&stars=true)](https://nodei.co/npm/fetch-charset-detection)
66

@@ -9,12 +9,7 @@ Charset detection and conversion utilities, originally from `node-fetch`.
99
From your Node.js application:
1010

1111
```js
12-
const {
13-
convertBody,
14-
extractContentType,
15-
getTotalBytes,
16-
writeToStream
17-
} = require("fetch-charset-detection");
12+
const convertBody = require("fetch-charset-detection");
1813
```
1914

2015
## API

package.json

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
"dev": "yarn js --watch",
2222
"build": "yarn js && yarn docs",
2323
"js": "tsc",
24-
"docs": "typedoc --out ./docs --mode file --target ES6 --ignoreCompilerErrors ./src",
24+
"docs": "typedoc",
2525
"lint": "xo",
2626
"test": "ava"
2727
},
@@ -37,23 +37,18 @@
3737
"@types/content-type": "^1.1.3",
3838
"@types/lodash": "^4.14.146",
3939
"ava": "^2.4.0",
40-
"xo": "^0.25.3",
4140
"eslint-config-richienb": "^0.2.2",
42-
"express": "^4.17.1",
43-
"fetch-blob": "^1.0.4",
44-
"form-data": "^3.0.0",
45-
"get-port": "^5.0.0",
4641
"node-fetch": "^2.6.0",
47-
"resumer": "^0.0.0",
4842
"ts-node": "^8.4.1",
4943
"typedoc": "^0.15.0",
50-
"typescript": "^3.7.2"
44+
"typescript": "^3.7.2",
45+
"xo": "^0.25.3"
5146
},
5247
"resolutions": {
5348
"eslint": "^6.6.0"
5449
},
5550
"xo": {
56-
"extends": "richienb/node",
51+
"extends": "richienb/ts",
5752
"overrides": [
5853
{
5954
"files": "test.js",

src/index.ts

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,51 @@
2424
* SOFTWARE.
2525
*/
2626

27-
export { convertBody } from "./lib/convert-body"
28-
export { extractContentType } from "./lib/extract-content-type"
29-
export { getTotalBytes } from "./lib/get-total-bytes"
30-
export { writeToStream } from "./lib/write-to-stream"
27+
import getCharset from "./utils/get-charset"
28+
import { decode } from "iconv-lite"
29+
import { load as $ } from "cheerio"
30+
import _ from "lodash"
31+
32+
/**
33+
* Detect buffer encoding and convert to target encoding
34+
* ref: http://www.w3.org/TR/2011/WD-html5-20110113/parsing.html#determining-the-character-encoding
35+
*
36+
* @param content The content to convert.
37+
* @param headers HTTP Headers provided with a request.
38+
*/
39+
export default function convertBody(content: Buffer | string, headers?: Headers): string {
	// Work on a local Buffer instead of reassigning the `content` parameter.
	const buffer = typeof content === "string" ? Buffer.from(content) : content

	// 1. Charset declared by the Content-Type header, when headers were supplied.
	const contentType = headers == null ? null : headers.get("content-type")
	let charset: string | null = contentType ? getCharset(contentType) : null

	// 2. No charset from the header: peek at no more than the first 1024 bytes
	//    of the body and look for an in-document declaration.
	if (!charset) {
		const head = buffer.slice(0, 1024).toString()

		if (head) {
			// Parse the HTML prefix once and reuse it for both meta lookups.
			const html = $(head)
			const declared =
				html("meta[charset]").attr("charset") || // HTML5
				// NOTE(review): matches the first <meta http-equiv> of any kind, not
				// only http-equiv="content-type" — confirm whether that is intended.
				html("meta[http-equiv][content]").attr("content") || // HTML4
				// XML: rewrite the `<?xml ... ?>` prolog into an ordinary `<xml ...>`
				// element so that its `encoding` attribute can be queried.
				$(head.replace(/<\?(.*)\?>/im, "<$1>"), { xmlMode: true }).root().find("xml").attr("encoding")

			charset = declared == null ? null : getCharset(declared)
		}
	}

	// Prevent decode issues when sites use incorrect encoding
	// ref: https://hsivonen.fi/encoding-menu/
	// A plain lower-casing is required here: lodash's `_.lowerCase` splits words
	// at letter/digit boundaries ("gb2312" -> "gb 2312"), which made the gb2312
	// entry impossible to match.
	if (charset && _.includes(["gb2312", "gbk"], charset.toLowerCase())) charset = "gb18030"

	// 3. Decode the whole body into a string, defaulting to UTF-8.
	return decode(buffer, charset || "utf-8")
}
74+

src/lib/convert-body.ts

Lines changed: 0 additions & 41 deletions
This file was deleted.

src/lib/extract-content-type.ts

Lines changed: 0 additions & 38 deletions
This file was deleted.

src/lib/get-total-bytes.ts

Lines changed: 0 additions & 27 deletions
This file was deleted.

src/lib/write-to-stream.ts

Lines changed: 0 additions & 25 deletions
This file was deleted.

src/utils/get-charset.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import niceTry from "nice-try"
55
/**
66
* Get the character set from a Content-Type header.
77
* @param contentType The Content-Type HTTP header.
8+
* @private
89
*/
910
export default function getCharset(contentType: string): string | null {
1011
if (_.isNil(contentType)) return null

src/utils/is.ts

Lines changed: 0 additions & 87 deletions
This file was deleted.

0 commit comments

Comments
 (0)