Skip to content

Commit f7ad40e

Browse files
committed
#pull 207
1 parent 81a2e2f commit f7ad40e

File tree

4 files changed

+104
-18
lines changed

4 files changed

+104
-18
lines changed

README.md

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,20 @@ npm install --save node-html-parser
1919
2020
## Performance
2121

22+
-- 2022-08-10
23+
2224
```shell
23-
html-parser :24.2329 ms/file ± 18.8092
24-
htmljs-parser :4.78952 ms/file ± 5.50403
25-
html-dom-parser :2.19594 ms/file ± 3.07470
26-
html5parser :1.72007 ms/file ± 2.22713
27-
cheerio :12.2220 ms/file ± 8.14063
28-
parse5 :6.77691 ms/file ± 4.12002
29-
htmlparser2 :2.33526 ms/file ± 3.43847
30-
htmlparser :17.6260 ms/file ± 122.314
31-
high5 :3.85676 ms/file ± 2.48878
32-
node-html-parser:2.04585 ms/file ± 1.23787
33-
node-html-parser (last release):2.00236 ms/file ± 1.22263
25+
html-parser :24.1595 ms/file ± 18.7667
26+
htmljs-parser :4.72064 ms/file ± 5.67689
27+
html-dom-parser :2.18055 ms/file ± 2.96136
28+
html5parser :1.69639 ms/file ± 2.17111
29+
cheerio :12.2122 ms/file ± 8.10916
30+
parse5 :6.50626 ms/file ± 4.02352
31+
htmlparser2 :2.38179 ms/file ± 3.42389
32+
htmlparser :17.4820 ms/file ± 128.041
33+
high5 :3.95188 ms/file ± 2.52313
34+
node-html-parser:2.04288 ms/file ± 1.25203
35+
node-html-parser (last release):2.00527 ms/file ± 1.21317
3436
```
3537

3638
Tested with [htmlparser-benchmark](https://github.com/AndreasMadsen/htmlparser-benchmark).
@@ -82,6 +84,10 @@ Parse the data provided, and return the root of the generated DOM.
8284
{
8385
lowerCaseTagName: false, // convert tag name to lower case (hurts performance heavily)
8486
comment: false, // retrieve comments (hurts performance slightly)
87+
voidTag:{
88+
tags: ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'], // optional and case insensitive, default value is ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']
89+
closingSlash: true // optional, default false. void tag serialisation, add a final slash <br/>
90+
},
8591
blockTextElements: {
8692
script: true, // keep text content when parsing
8793
noscript: true, // keep text content when parsing

src/nodes/html.ts

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,12 @@ import { selectAll, selectOne } from 'css-select';
22
import he from 'he';
33
import arr_back from '../back';
44
import Matcher from '../matcher';
5+
import VoidTag from '../void-tag';
56
import CommentNode from './comment';
67
import Node from './node';
78
import TextNode from './text';
89
import NodeType from './type';
910

10-
const voidTags = new Set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']);
11-
1211
type IRawTagName =
1312
| 'LI'
1413
| 'P'
@@ -173,7 +172,8 @@ export default class HTMLElement extends Node {
173172
keyAttrs: KeyAttributes,
174173
public rawAttrs = '',
175174
parentNode: HTMLElement | null,
176-
range?: [number, number]
175+
range: [number, number],
176+
private voidTag = new VoidTag()
177177
) {
178178
super(parentNode, range);
179179
this.rawTagName = tagName;
@@ -237,7 +237,7 @@ export default class HTMLElement extends Node {
237237
}
238238

239239
public get isVoidElement() {
240-
return voidTags.has(this.localName);
240+
return this.voidTag.isVoidElement(this.localName);
241241
}
242242

243243
/**
@@ -313,7 +313,7 @@ export default class HTMLElement extends Node {
313313
const tag = this.rawTagName;
314314
if (tag) {
315315
const attrs = this.rawAttrs ? ` ${this.rawAttrs}` : '';
316-
return this.isVoidElement ? `<${tag}${attrs}>` : `<${tag}${attrs}>${this.innerHTML}</${tag}>`;
316+
return this.voidTag.formatNode(tag, attrs, this.innerHTML);
317317
}
318318
return this.innerHTML;
319319
}
@@ -986,6 +986,16 @@ export interface Options {
986986
blockTextElements: {
987987
[tag: string]: boolean;
988988
};
989+
voidTag?: {
990+
/**
991+
* optional, default value is ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']
992+
*/
993+
tags?: string[];
994+
/**
995+
* void tag serialisation, add a final slash <br/>
996+
*/
997+
closingSlash?: boolean;
998+
}
989999
}
9901000

9911001
const frameflag = 'documentfragmentcontainer';
@@ -997,6 +1007,7 @@ const frameflag = 'documentfragmentcontainer';
9971007
* @return {HTMLElement} root element
9981008
*/
9991009
export function base_parse(data: string, options = { lowerCaseTagName: false, comment: false } as Partial<Options>) {
1010+
const voidTag = new VoidTag(options?.voidTag?.closingSlash, options?.voidTag?.tags);
10001011
const elements = options.blockTextElements || {
10011012
script: true,
10021013
noscript: true,
@@ -1016,7 +1027,7 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
10161027
}
10171028

10181029
const createRange = (startPos: number, endPos: number): [number, number] => [startPos - frameFlagOffset, endPos - frameFlagOffset];
1019-
const root = new HTMLElement(null, {}, '', null, [0, data.length]);
1030+
const root = new HTMLElement(null, {}, '', null, [0, data.length], voidTag);
10201031

10211032
let currentParent = root;
10221033
const stack = [root];
@@ -1099,7 +1110,7 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
10991110

11001111
currentParent = currentParent.appendChild(
11011112
// Initialize range (end position updated later for closed tags)
1102-
new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos, tagEndPos))
1113+
new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos, tagEndPos), voidTag)
11031114
);
11041115
stack.push(currentParent);
11051116

src/void-tag.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
/**
 * Knows which tag names are void elements (elements serialised without
 * children or an end tag) and how such elements are written out.
 *
 * Tag names are stored lower-cased and every lookup lower-cases its argument,
 * so matching is case-insensitive for both the default and a custom tag list.
 */
export default class VoidTag {
	/**
	 * Tag names used when the caller does not supply a `tags` array.
	 * Shared by all instances; it is never mutated.
	 */
	private static readonly defaultTags: ReadonlySet<string> = new Set([
		'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'
	]);

	/** Lower-cased tag names this instance treats as void elements. */
	private voidTags: ReadonlySet<string>;

	/**
	 * @param addClosingSlash when true, void elements are serialised with a
	 *   trailing slash (`<br/>`); defaults to false (`<br>`).
	 * @param tags tag names that replace the default list. Any value that is
	 *   not an array selects the default list.
	 */
	public constructor(
		public addClosingSlash = false,
		tags?: string[]
	) {
		this.voidTags = Array.isArray(tags)
			? new Set(tags.map((tag) => tag.toLowerCase()))
			: VoidTag.defaultTags;
	}

	/**
	 * Serialises one element.
	 *
	 * @param tag tag name, written to the output exactly as given
	 * @param attrs attribute text placed right after the tag name; callers
	 *   include the leading space themselves, or pass '' for no attributes
	 * @param innerHTML serialised children; ignored for void elements
	 * @returns the element's HTML text
	 */
	public formatNode(tag: string, attrs: string, innerHTML: string): string {
		if (!this.isVoidElement(tag)) {
			return `<${tag}${attrs}>${innerHTML}</${tag}>`;
		}
		if (!this.addClosingSlash) {
			return `<${tag}${attrs}>`;
		}
		// Separate the slash from the last attribute so it cannot be read as
		// part of an unquoted attribute value (`attr=bar/`).
		const closingSpace = attrs && !attrs.endsWith(' ') ? ' ' : '';
		return `<${tag}${attrs}${closingSpace}/>`;
	}

	/**
	 * @param tag tag name in any letter case
	 * @returns true when `tag` is in this instance's void-tag list
	 */
	public isVoidElement(tag: string): boolean {
		return this.voidTags.has(tag.toLowerCase());
	}
}

test/tests/issues/207.js

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
const { parse } = require('@test/test-target');
2+
3+
// https://github.com/taoqf/node-html-parser/issues/206
4+
// Serialisation checks for the `voidTag` parse option: each case parses a
// snippet and compares `root.toString()` with the expected markup.
// NOTE(review): the link above this suite points at issue #206 while the file
// is named 207.js — confirm which number is intended.
describe('void tags', function () {
5+
// No options: the void element is written without a closing slash.
it('default', function () {
6+
const root = parse('<div><br></div>');
7+
root.toString().should.eql('<div><br></div>');
8+
});
9+
// `closingSlash: true` makes the void element serialise as `<br/>`.
it('closingSlash', function () {
10+
const root = parse('<div><br></div>', {
11+
voidTag: {
12+
closingSlash: true
13+
}
14+
});
15+
root.toString().should.eql('<div><br/></div>');
16+
});
17+
// NOTE(review): input and expectation are identical to the 'closingSlash'
// case above; presumably the input was meant to contain a space before the
// closing slash — confirm.
it('closingSlash with space', function () {
18+
const root = parse('<div><br></div>', {
19+
voidTag: {
20+
closingSlash: true
21+
}
22+
});
23+
root.toString().should.eql('<div><br/></div>');
24+
});
25+
// Custom void tag `foo` with an attribute: the expected output has a space
// between the attribute text and the closing slash.
it('closingSlash with attribute ends with no blank space', function () {
26+
const root = parse('<foo attr=bar/>', {
27+
voidTag: {
28+
tags: ['foo'],
29+
closingSlash: true
30+
}
31+
});
32+
root.toString().should.eql('<foo attr=bar />');
33+
});
34+
// NOTE(review): the parsed element is `div`, which is not in `tags: ['foo']`,
// so the expectation is an ordinary open/close pair and no closing slash is
// exercised here — confirm this matches the intent of the test title.
it('closingSlash with attribute ends with blank space', function () {
35+
const root = parse('<div foo=bar />', {
36+
voidTag: {
37+
tags: ['foo'],
38+
closingSlash: true
39+
}
40+
});
41+
root.toString().should.eql('<div foo=bar ></div>');
42+
});
43+
});

0 commit comments

Comments
 (0)