Skip to content

Commit 3f4f198

Browse files
committed
separate parser from formatter
1 parent 6a304ad commit 3f4f198

File tree

4 files changed

+176
-174
lines changed

4 files changed

+176
-174
lines changed
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import { lenientParse } from './lenient-parse';
2+
3+
// Snapshot test: parses a small, complete XHTML document (doctype, nested
// elements, a quoted attribute and a self-closing tag) and compares the
// resulting node tree against the stored snapshot.
describe('lenientParse()', () => {
  it('should parse base doucment correctly', () => {
    const html = `<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html><head></head><body style="background-color:#fff;"><h1>whatever</h1><input placeholder="hello world"/></body></html>`;

    const parsedNodes = lenientParse(html);

    expect(parsedNodes).toMatchSnapshot();
  });
});
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
export interface HtmlTagProperty {
  name: string;
  /**
   * The attribute value exactly as written in the markup, including its
   * surrounding quotes when it has any. Attributes written without a value
   * (e.g. `disabled`) and attributes with an empty unquoted value are stored
   * as `""`, so that printing `name=value` still yields valid markup.
   */
  value: string;
}

export interface HtmlTag {
  type: 'tag';
  name: string;
  /**
   * Whether the html tag is self-closing, or a void element in spec nomenclature.
   * Tags flagged as void never receive children.
   */
  void: boolean;
  properties: HtmlTagProperty[];
  children: HtmlNode[];
}

/**
 * The DOCTYPE declaration of the document. `content` is everything between
 * `<!DOCTYPE` and the closing `>`, leading whitespace included.
 */
export interface HtmlDoctype {
  type: 'doctype';
  content: string;
}

/**
 * An HTML comment. `content` is the text between `<!--` and `-->`.
 */
export interface HtmlComment {
  type: 'comment';
  content: string;
}

/**
 * A run of character data between two pieces of markup.
 */
export interface HtmlText {
  type: 'text';
  content: string;
}

export type HtmlNode = HtmlTag | HtmlDoctype | HtmlComment | HtmlText;

/**
 * Elements that never have content or an end tag, so they must not become the
 * parent of what follows them even when written without a trailing slash.
 */
const HTML_VOID_ELEMENTS: ReadonlySet<string> = new Set([
  'area',
  'base',
  'br',
  'col',
  'embed',
  'hr',
  'img',
  'input',
  'link',
  'meta',
  'source',
  'track',
  'wbr',
]);

/** Characters that may directly follow `<` for it to start markup. */
const MARKUP_START_CHARACTER = /^[A-Za-z\/!]$/;

/** Whether `character` is one of the whitespace characters that separate tokens inside a tag. */
const isHtmlWhitespace = (character: string): boolean =>
  character === ' ' ||
  character === '\t' ||
  character === '\n' ||
  character === '\r' ||
  character === '\f';

/**
 * Finds the next `<` at or after `from` that actually opens a tag, closing
 * tag, comment or declaration. A `<` followed by anything else (as in
 * `1 < 2`) is plain text and is skipped. Returns -1 when there is none.
 */
const findMarkupStart = (html: string, from: number): number => {
  let candidate = html.indexOf('<', from);
  while (
    candidate !== -1 &&
    !MARKUP_START_CHARACTER.test(html.charAt(candidate + 1))
  ) {
    candidate = html.indexOf('<', candidate + 1);
  }
  return candidate;
};

/**
 * Parses the opening tag whose `<` is at `start`.
 *
 * @returns the parsed tag and the index just past its closing `>`, or `null`
 * when the document ends before the tag is closed.
 */
const parseOpeningTag = (
  html: string,
  start: number,
): { tag: HtmlTag; end: number } | null => {
  let index = start + 1;

  // The name runs until whitespace, `>` or `/>`, so `<br/>` is named `br`.
  const nameStart = index;
  while (
    index < html.length &&
    !isHtmlWhitespace(html.charAt(index)) &&
    html.charAt(index) !== '>' &&
    !html.startsWith('/>', index)
  ) {
    index++;
  }
  const name = html.slice(nameStart, index);

  const tag: HtmlTag = {
    type: 'tag',
    name,
    void: HTML_VOID_ELEMENTS.has(name.toLowerCase()),
    properties: [],
    children: [],
  };

  while (index < html.length) {
    const character = html.charAt(index);
    if (character === '>') {
      return { tag, end: index + 1 };
    }
    if (html.startsWith('/>', index)) {
      tag.void = true;
      return { tag, end: index + 2 };
    }
    // Separators, and a stray `/` that does not close the tag, carry no meaning.
    if (isHtmlWhitespace(character) || character === '/') {
      index++;
      continue;
    }

    const propertyNameStart = index;
    while (
      index < html.length &&
      !isHtmlWhitespace(html.charAt(index)) &&
      html.charAt(index) !== '=' &&
      html.charAt(index) !== '>' &&
      !html.startsWith('/>', index)
    ) {
      index++;
    }
    const propertyName = html.slice(propertyNameStart, index);

    // Whitespace is allowed on both sides of the `=`.
    let valueStart = index;
    while (isHtmlWhitespace(html.charAt(valueStart))) {
      valueStart++;
    }
    if (html.charAt(valueStart) !== '=') {
      // Attribute without a value, e.g. `disabled`.
      tag.properties.push({ name: propertyName, value: '""' });
      continue;
    }
    valueStart++;
    while (isHtmlWhitespace(html.charAt(valueStart))) {
      valueStart++;
    }

    const quote = html.charAt(valueStart);
    let valueEnd: number;
    if (quote === '"' || quote === "'") {
      const closingQuote = html.indexOf(quote, valueStart + 1);
      if (closingQuote === -1) {
        return null;
      }
      // The quotes are kept as part of the value.
      valueEnd = closingQuote + 1;
    } else {
      // Unquoted value: runs until whitespace or the end of the tag.
      valueEnd = valueStart;
      while (
        valueEnd < html.length &&
        !isHtmlWhitespace(html.charAt(valueEnd)) &&
        html.charAt(valueEnd) !== '>'
      ) {
        valueEnd++;
      }
    }

    const rawValue = html.slice(valueStart, valueEnd);
    tag.properties.push({
      name: propertyName,
      value: rawValue === '' ? '""' : rawValue,
    });
    index = valueEnd;
  }

  return null;
};

/**
 * Parses `html` into a tree of nodes without validating it.
 *
 * The parser always terminates and never throws on malformed input:
 * - an unterminated comment or doctype takes the rest of the document;
 * - an unterminated opening or closing tag at the end of the document is
 *   dropped, which is also what the HTML tokenizer does on end-of-file;
 * - a closing tag that does not match the innermost open tag is ignored;
 * - a `<` that does not start markup is kept as text.
 *
 * @param html - The markup to parse.
 * @returns The top-level nodes, in document order.
 */
export const lenientParse = (html: string): HtmlNode[] => {
  const result: HtmlNode[] = [];

  const stack: HtmlTag[] = []; // Stack to keep track of parent tags
  const addToTree = (node: HtmlNode) => {
    const currentParent = stack[stack.length - 1];
    if (currentParent) {
      currentParent.children.push(node);
    } else {
      result.push(node);
    }
  };

  let index = 0; // Current parsing index
  while (index < html.length) {
    const markupStart = findMarkupStart(html, index);
    if (markupStart === -1) {
      addToTree({ type: 'text', content: html.slice(index) });
      break;
    }
    if (markupStart > index) {
      addToTree({ type: 'text', content: html.slice(index, markupStart) });
      index = markupStart;
    }

    if (html.startsWith('<!--', index)) {
      const commentEnd = html.indexOf('-->', index + '<!--'.length);
      if (commentEnd === -1) {
        // Assumes the rest of the document is part of this comment
        const content = html.slice(index + '<!--'.length);
        addToTree({ type: 'comment', content });
        break;
      }

      const content = html.substring(index + '<!--'.length, commentEnd);
      addToTree({ type: 'comment', content });
      index = commentEnd + '-->'.length;
      continue;
    }

    // The DOCTYPE keyword is case-insensitive (`<!doctype html>` is common).
    const isDoctype =
      html.slice(index, index + '<!DOCTYPE'.length).toUpperCase() ===
      '<!DOCTYPE';
    if (isDoctype) {
      const declEnd = html.indexOf('>', index + '<!DOCTYPE'.length);
      if (declEnd === -1) {
        // Assumes the rest of the document is part of this doctype
        const content = html.slice(index + '<!DOCTYPE'.length);
        addToTree({ type: 'doctype', content });
        break;
      }

      const content = html.substring(index + '<!DOCTYPE'.length, declEnd);
      addToTree({ type: 'doctype', content });
      index = declEnd + '>'.length;
      continue;
    }

    if (html.startsWith('</', index)) {
      const bracketEnd = html.indexOf('>', index + 2);
      if (bracketEnd === -1) {
        // Unterminated closing tag at the end of the document.
        break;
      }
      // `</div >` is a valid way to close `div`.
      const tagName = html.slice(index + 2, bracketEnd).trim();

      const openTag = stack[stack.length - 1];
      if (openTag && openTag.name === tagName) {
        stack.pop();
      }
      // Otherwise this is a mismatched closing tag, which is ignored.
      index = bracketEnd + 1;
      continue;
    }

    const parsed = parseOpeningTag(html, index);
    if (parsed === null) {
      // Unterminated opening tag at the end of the document.
      break;
    }
    addToTree(parsed.tag);
    if (!parsed.tag.void) {
      stack.push(parsed.tag);
    }
    index = parsed.end;
  }

  return result;
};

packages/render/src/shared/utils/pretty.spec.ts

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import fs from 'node:fs';
22
import path from 'node:path';
3-
import { lenientParse, pretty, wrapText } from './pretty';
3+
import { pretty, wrapText } from './pretty';
44

55
const stripeHtml = fs.readFileSync(
66
path.resolve(__dirname, './tests/stripe-email.html'),
@@ -11,13 +11,6 @@ const codepenHtml = fs.readFileSync(
1111
'utf8',
1212
);
1313

14-
// Snapshot test: parses a small, complete XHTML document (doctype, nested
// elements, a quoted attribute and a self-closing tag) and compares the
// resulting node tree against the stored snapshot.
describe('lenientParse()', () => {
  it('should parse base doucment correctly', () => {
    const html = `<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html><head></head><body style="background-color:#fff;"><h1>whatever</h1><input placeholder="hello world"/></body></html>`;

    const parsedNodes = lenientParse(html);

    expect(parsedNodes).toMatchSnapshot();
  });
});
20-
2114
describe('pretty', () => {
2215
it('should prettify base doucment correctly', () => {
2316
const document =

packages/render/src/shared/utils/pretty.ts

Lines changed: 1 addition & 166 deletions
Original file line numberDiff line numberDiff line change
@@ -1,169 +1,4 @@
1-
interface HtmlTagProperty {
  name: string;
  /**
   * The attribute value exactly as written in the markup, including its
   * surrounding quotes when it has any. Attributes written without a value
   * (e.g. `disabled`) and attributes with an empty unquoted value are stored
   * as `""`, so that printing `name=value` still yields valid markup.
   */
  value: string;
}

interface HtmlTag {
  type: 'tag';
  name: string;
  /**
   * Whether the html tag is self-closing, or a void element in spec nomenclature.
   * Tags flagged as void never receive children.
   */
  void: boolean;
  properties: HtmlTagProperty[];
  children: HtmlNode[];
}

/**
 * The DOCTYPE declaration of the document. `content` is everything between
 * `<!DOCTYPE` and the closing `>`, leading whitespace included.
 */
interface HtmlDoctype {
  type: 'doctype';
  content: string;
}

/**
 * An HTML comment. `content` is the text between `<!--` and `-->`.
 */
interface HtmlComment {
  type: 'comment';
  content: string;
}

/**
 * A run of character data between two pieces of markup.
 */
interface HtmlText {
  type: 'text';
  content: string;
}

type HtmlNode = HtmlTag | HtmlDoctype | HtmlComment | HtmlText;

/**
 * Elements that never have content or an end tag, so they must not become the
 * parent of what follows them even when written without a trailing slash.
 */
const HTML_VOID_ELEMENTS: ReadonlySet<string> = new Set([
  'area',
  'base',
  'br',
  'col',
  'embed',
  'hr',
  'img',
  'input',
  'link',
  'meta',
  'source',
  'track',
  'wbr',
]);

/** Characters that may directly follow `<` for it to start markup. */
const MARKUP_START_CHARACTER = /^[A-Za-z\/!]$/;

/** Whether `character` is one of the whitespace characters that separate tokens inside a tag. */
const isHtmlWhitespace = (character: string): boolean =>
  character === ' ' ||
  character === '\t' ||
  character === '\n' ||
  character === '\r' ||
  character === '\f';

/**
 * Finds the next `<` at or after `from` that actually opens a tag, closing
 * tag, comment or declaration. A `<` followed by anything else (as in
 * `1 < 2`) is plain text and is skipped. Returns -1 when there is none.
 */
const findMarkupStart = (html: string, from: number): number => {
  let candidate = html.indexOf('<', from);
  while (
    candidate !== -1 &&
    !MARKUP_START_CHARACTER.test(html.charAt(candidate + 1))
  ) {
    candidate = html.indexOf('<', candidate + 1);
  }
  return candidate;
};

/**
 * Parses the opening tag whose `<` is at `start`.
 *
 * @returns the parsed tag and the index just past its closing `>`, or `null`
 * when the document ends before the tag is closed.
 */
const parseOpeningTag = (
  html: string,
  start: number,
): { tag: HtmlTag; end: number } | null => {
  let index = start + 1;

  // The name runs until whitespace, `>` or `/>`, so `<br/>` is named `br`.
  const nameStart = index;
  while (
    index < html.length &&
    !isHtmlWhitespace(html.charAt(index)) &&
    html.charAt(index) !== '>' &&
    !html.startsWith('/>', index)
  ) {
    index++;
  }
  const name = html.slice(nameStart, index);

  const tag: HtmlTag = {
    type: 'tag',
    name,
    void: HTML_VOID_ELEMENTS.has(name.toLowerCase()),
    properties: [],
    children: [],
  };

  while (index < html.length) {
    const character = html.charAt(index);
    if (character === '>') {
      return { tag, end: index + 1 };
    }
    if (html.startsWith('/>', index)) {
      tag.void = true;
      return { tag, end: index + 2 };
    }
    // Separators, and a stray `/` that does not close the tag, carry no meaning.
    if (isHtmlWhitespace(character) || character === '/') {
      index++;
      continue;
    }

    const propertyNameStart = index;
    while (
      index < html.length &&
      !isHtmlWhitespace(html.charAt(index)) &&
      html.charAt(index) !== '=' &&
      html.charAt(index) !== '>' &&
      !html.startsWith('/>', index)
    ) {
      index++;
    }
    const propertyName = html.slice(propertyNameStart, index);

    // Whitespace is allowed on both sides of the `=`.
    let valueStart = index;
    while (isHtmlWhitespace(html.charAt(valueStart))) {
      valueStart++;
    }
    if (html.charAt(valueStart) !== '=') {
      // Attribute without a value, e.g. `disabled`.
      tag.properties.push({ name: propertyName, value: '""' });
      continue;
    }
    valueStart++;
    while (isHtmlWhitespace(html.charAt(valueStart))) {
      valueStart++;
    }

    const quote = html.charAt(valueStart);
    let valueEnd: number;
    if (quote === '"' || quote === "'") {
      const closingQuote = html.indexOf(quote, valueStart + 1);
      if (closingQuote === -1) {
        return null;
      }
      // The quotes are kept as part of the value.
      valueEnd = closingQuote + 1;
    } else {
      // Unquoted value: runs until whitespace or the end of the tag.
      valueEnd = valueStart;
      while (
        valueEnd < html.length &&
        !isHtmlWhitespace(html.charAt(valueEnd)) &&
        html.charAt(valueEnd) !== '>'
      ) {
        valueEnd++;
      }
    }

    const rawValue = html.slice(valueStart, valueEnd);
    tag.properties.push({
      name: propertyName,
      value: rawValue === '' ? '""' : rawValue,
    });
    index = valueEnd;
  }

  return null;
};

/**
 * Parses `html` into a tree of nodes without validating it.
 *
 * The parser always terminates and never throws on malformed input:
 * - an unterminated comment or doctype takes the rest of the document;
 * - an unterminated opening or closing tag at the end of the document is
 *   dropped, which is also what the HTML tokenizer does on end-of-file;
 * - a closing tag that does not match the innermost open tag is ignored;
 * - a `<` that does not start markup is kept as text.
 *
 * @param html - The markup to parse.
 * @returns The top-level nodes, in document order.
 */
export const lenientParse = (html: string): HtmlNode[] => {
  const result: HtmlNode[] = [];

  const stack: HtmlTag[] = []; // Stack to keep track of parent tags
  const addToTree = (node: HtmlNode) => {
    const currentParent = stack[stack.length - 1];
    if (currentParent) {
      currentParent.children.push(node);
    } else {
      result.push(node);
    }
  };

  let index = 0; // Current parsing index
  while (index < html.length) {
    const markupStart = findMarkupStart(html, index);
    if (markupStart === -1) {
      addToTree({ type: 'text', content: html.slice(index) });
      break;
    }
    if (markupStart > index) {
      addToTree({ type: 'text', content: html.slice(index, markupStart) });
      index = markupStart;
    }

    if (html.startsWith('<!--', index)) {
      const commentEnd = html.indexOf('-->', index + '<!--'.length);
      if (commentEnd === -1) {
        // Assumes the rest of the document is part of this comment
        const content = html.slice(index + '<!--'.length);
        addToTree({ type: 'comment', content });
        break;
      }

      const content = html.substring(index + '<!--'.length, commentEnd);
      addToTree({ type: 'comment', content });
      index = commentEnd + '-->'.length;
      continue;
    }

    // The DOCTYPE keyword is case-insensitive (`<!doctype html>` is common).
    const isDoctype =
      html.slice(index, index + '<!DOCTYPE'.length).toUpperCase() ===
      '<!DOCTYPE';
    if (isDoctype) {
      const declEnd = html.indexOf('>', index + '<!DOCTYPE'.length);
      if (declEnd === -1) {
        // Assumes the rest of the document is part of this doctype
        const content = html.slice(index + '<!DOCTYPE'.length);
        addToTree({ type: 'doctype', content });
        break;
      }

      const content = html.substring(index + '<!DOCTYPE'.length, declEnd);
      addToTree({ type: 'doctype', content });
      index = declEnd + '>'.length;
      continue;
    }

    if (html.startsWith('</', index)) {
      const bracketEnd = html.indexOf('>', index + 2);
      if (bracketEnd === -1) {
        // Unterminated closing tag at the end of the document.
        break;
      }
      // `</div >` is a valid way to close `div`.
      const tagName = html.slice(index + 2, bracketEnd).trim();

      const openTag = stack[stack.length - 1];
      if (openTag && openTag.name === tagName) {
        stack.pop();
      }
      // Otherwise this is a mismatched closing tag, which is ignored.
      index = bracketEnd + 1;
      continue;
    }

    const parsed = parseOpeningTag(html, index);
    if (parsed === null) {
      // Unterminated opening tag at the end of the document.
      break;
    }
    addToTree(parsed.tag);
    if (!parsed.tag.void) {
      stack.push(parsed.tag);
    }
    index = parsed.end;
  }

  return result;
};
1+
import { type HtmlNode, lenientParse } from './lenient-parse';
1672

1683
interface Options {
1694
/**

0 commit comments

Comments
 (0)