Commit cfdb371

Author: Robert Jackson (committed)

Add support for parsing <!DOCTYPE html>
The [spec](https://html.spec.whatwg.org/multipage/syntax.html#the-doctype) says this about `<!DOCTYPE`:

> DOCTYPEs are required for legacy reasons. When omitted, browsers tend to
> use a different rendering mode that is incompatible with some
> specifications. Including the DOCTYPE in a document ensures that the
> browser makes a best-effort attempt at following the relevant
> specifications.
Parent: 220bf73

File tree: 4 files changed, 295 additions and 1 deletion
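For a quick sense of what the change does end to end, here is a usage sketch based on the tests added below; it assumes the package entry point re-exports `tokenize` and `TokenType` the same way the test file imports them.

```ts
import { tokenize, TokenType } from 'simple-html-tokenizer';

// '<!DOCTYPE html>' now produces a Doctype token at the front of the stream,
// followed by the usual StartTag/Chars/EndTag tokens.
let tokens = tokenize('<!DOCTYPE html><p>hi</p>');

console.log(tokens[0]); // per the new tests, deep-equal to { type: 'Doctype', name: 'html' }
console.log(tokens[0].type === TokenType.Doctype); // true
```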

src/evented-tokenizer.ts

Lines changed: 188 additions & 0 deletions
@@ -185,6 +185,194 @@ export default class EventedTokenizer {
         this.consume();
         this.transitionTo(TokenizerState.commentStart);
         this.delegate.beginComment();
+      } else {
+        let maybeDoctype = char.toUpperCase() + this.input.substring(this.index, this.index + 6).toUpperCase();
+
+        if (maybeDoctype === 'DOCTYPE') {
+          this.consume();
+          this.consume();
+          this.consume();
+          this.consume();
+          this.consume();
+          this.consume();
+          this.transitionTo(TokenizerState.doctype);
+          this.delegate.beginDoctype();
+        }
+      }
+    },
+
+    doctype() {
+      let char = this.consume();
+
+      if (isSpace(char)) {
+        this.transitionTo(TokenizerState.beforeDoctypeName);
+      }
+    },
+
+    beforeDoctypeName() {
+      let char = this.consume();
+
+      if (isSpace(char)) {
+        return;
+      } else {
+        this.transitionTo(TokenizerState.doctypeName);
+        this.delegate.appendToDoctypeName(char.toLowerCase());
+      }
+    },
+
+    doctypeName() {
+      let char = this.consume();
+
+      if (isSpace(char)) {
+        this.transitionTo(TokenizerState.afterDoctypeName);
+      } else if (char === '>') {
+        this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else {
+        this.delegate.appendToDoctypeName(char.toLowerCase());
+      }
+    },
+
+    afterDoctypeName() {
+      let char = this.consume();
+
+      if (isSpace(char)) {
+        return;
+      } else if (char === '>') {
+        this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else {
+        let nextSixChars = char.toUpperCase() + this.input.substring(this.index, this.index + 5).toUpperCase();
+
+        let isPublic = nextSixChars.toUpperCase() === 'PUBLIC';
+        let isSystem = nextSixChars.toUpperCase() === 'SYSTEM';
+
+        if (isPublic || isSystem) {
+          this.consume();
+          this.consume();
+          this.consume();
+          this.consume();
+          this.consume();
+          this.consume();
+        }
+
+        if (isPublic) {
+          this.transitionTo(TokenizerState.afterDoctypePublicKeyword);
+        } else if (isSystem) {
+          this.transitionTo(TokenizerState.afterDoctypeSystemKeyword);
+        }
+      }
+    },
+
+    afterDoctypePublicKeyword() {
+      let char = this.peek();
+
+      if (isSpace(char)) {
+        this.transitionTo(TokenizerState.beforeDoctypePublicIdentifier);
+        this.consume();
+      } else if (char === '"') {
+        this.transitionTo(TokenizerState.doctypePublicIdentifierDoubleQuoted);
+        this.consume();
+      } else if (char === "'") {
+        this.transitionTo(TokenizerState.doctypePublicIdentifierSingleQuoted);
+        this.consume();
+      } else if (char === '>') {
+        this.consume();
+        this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      }
+    },
+
+    doctypePublicIdentifierDoubleQuoted() {
+      let char = this.consume();
+
+      if (char === '"') {
+        this.transitionTo(TokenizerState.afterDoctypePublicIdentifier);
+      } else if (char === '>') {
+        this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else {
+        this.delegate.appendToDoctypePublicIdentifier(char);
+      }
+    },
+
+    doctypePublicIdentifierSingleQuoted() {
+      let char = this.consume();
+
+      if (char === "'") {
+        this.transitionTo(TokenizerState.afterDoctypePublicIdentifier);
+      } else if (char === '>') {
+        this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else {
+        this.delegate.appendToDoctypePublicIdentifier(char);
+      }
+    },
+
+    afterDoctypePublicIdentifier() {
+      let char = this.consume();
+
+      if (isSpace(char)) {
+        this.transitionTo(TokenizerState.betweenDoctypePublicAndSystemIdentifiers);
+      } else if (char === '>') {
+        this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else if (char === '"') {
+        this.transitionTo(TokenizerState.doctypeSystemIdentifierDoubleQuoted);
+      } else if (char === "'") {
+        this.transitionTo(TokenizerState.doctypeSystemIdentifierSingleQuoted);
+      }
+    },
+
+    betweenDoctypePublicAndSystemIdentifiers() {
+      let char = this.consume();
+
+      if (isSpace(char)) {
+        return;
+      } else if (char === '>') {
+        this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else if (char === '"') {
+        this.transitionTo(TokenizerState.doctypeSystemIdentifierDoubleQuoted);
+      } else if (char === "'") {
+        this.transitionTo(TokenizerState.doctypeSystemIdentifierSingleQuoted);
+      }
+    },
+
+    doctypeSystemIdentifierDoubleQuoted() {
+      let char = this.consume();
+
+      if (char === '"') {
+        this.transitionTo(TokenizerState.afterDoctypeSystemIdentifier);
+      } else if (char === '>') {
+        this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else {
+        this.delegate.appendToDoctypeSystemIdentifier(char);
+      }
+    },
+
+    doctypeSystemIdentifierSingleQuoted() {
+      let char = this.consume();
+
+      if (char === "'") {
+        this.transitionTo(TokenizerState.afterDoctypeSystemIdentifier);
+      } else if (char === '>') {
+        this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else {
+        this.delegate.appendToDoctypeSystemIdentifier(char);
+      }
+    },
+
+    afterDoctypeSystemIdentifier() {
+      let char = this.consume();
+
+      if (isSpace(char)) {
+        return;
+      } else if (char === '>') {
+        this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
       }
     },
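The keyword detection above (`DOCTYPE` in `markupDeclarationOpen`, `PUBLIC`/`SYSTEM` in `afterDoctypeName`) pairs the character that was just consumed with a fixed-length look-ahead into `this.input`. A standalone sketch of the same case-insensitive check follows; the helper name is hypothetical and not part of the commit.

```ts
// Hypothetical helper mirroring the look-ahead used by the new states:
// `char` is the character already consumed, `input`/`index` are the source
// string and the tokenizer's current position, `keyword` is e.g. 'DOCTYPE'.
function matchesKeyword(char: string, input: string, index: number, keyword: string): boolean {
  // One character is already in hand, so only keyword.length - 1 more are peeked at.
  let candidate = char.toUpperCase() + input.substring(index, index + keyword.length - 1).toUpperCase();
  return candidate === keyword;
}

// After "<!" and consuming the 'd' of '<!doctype html>', index points at 'o':
matchesKeyword('d', '<!doctype html>', 3, 'DOCTYPE'); // true; matching is case-insensitive
matchesKeyword('-', '<!-- comment -->', 3, 'DOCTYPE'); // false; the tokenizer's comment branch handles this
```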

src/tokenizer.ts

Lines changed: 35 additions & 0 deletions
@@ -99,6 +99,41 @@ export default class Tokenizer implements TokenizerDelegate {
 
   // Data
 
+  beginDoctype() {
+    this.push({
+      type: TokenType.Doctype,
+      name: '',
+    });
+  }
+
+  appendToDoctypeName(char: string) {
+    this.current(TokenType.Doctype).name += char;
+  }
+
+  appendToDoctypePublicIdentifier(char: string) {
+    let doctype = this.current(TokenType.Doctype);
+
+    if (doctype.publicIdentifier === undefined) {
+      doctype.publicIdentifier = char;
+    } else {
+      doctype.publicIdentifier += char;
+    }
+  }
+
+  appendToDoctypeSystemIdentifier(char: string) {
+    let doctype = this.current(TokenType.Doctype);
+
+    if (doctype.systemIdentifier === undefined) {
+      doctype.systemIdentifier = char;
+    } else {
+      doctype.systemIdentifier += char;
+    }
+  }
+
+  endDoctype() {
+    this.addLocInfo();
+  }
+
   beginData() {
     this.push({
       type: TokenType.Chars,
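The fields these delegate methods accumulate are enough to round-trip a declaration back to text. A rough sketch of such a serializer follows; it is hypothetical, not part of this commit, uses only the `Doctype` fields defined in `src/types.ts` below, and leaves quoting of the identifiers naive.

```ts
import { Doctype, TokenType } from 'simple-html-tokenizer'; // assuming these are re-exported, as in the tests

// Hypothetical: rebuild the markup for a Doctype token produced by the tokenizer.
function serializeDoctype(token: Doctype): string {
  let out = `<!DOCTYPE ${token.name}`;

  if (token.publicIdentifier !== undefined) {
    out += ` PUBLIC "${token.publicIdentifier}"`;
  }

  if (token.systemIdentifier !== undefined) {
    // Without a public identifier, the SYSTEM keyword introduces the system identifier.
    out += `${token.publicIdentifier === undefined ? ' SYSTEM' : ''} "${token.systemIdentifier}"`;
  }

  return out + '>';
}

serializeDoctype({ type: TokenType.Doctype, name: 'html' }); // '<!DOCTYPE html>'
```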

src/types.ts

Lines changed: 15 additions & 1 deletion
@@ -30,6 +30,12 @@ export interface TokenBase<T extends TokenType> {
   loc?: Location;
 }
 
+export interface Doctype extends TokenBase<TokenType.Doctype> {
+  name: string;
+  publicIdentifier?: string;
+  systemIdentifier?: string;
+}
+
 export interface StartTag extends TokenBase<TokenType.StartTag> {
   tagName: string;
   attributes: Attribute[];
@@ -48,9 +54,10 @@ export interface Comment extends TokenBase<TokenType.Comment> {
   chars: string;
 }
 
-export type Token = StartTag | EndTag | Chars | Comment;
+export type Token = StartTag | EndTag | Chars | Comment | Doctype;
 
 export const enum TokenType {
+  Doctype = 'Doctype',
   StartTag = 'StartTag',
   EndTag = 'EndTag',
   Chars = 'Chars',
@@ -62,13 +69,20 @@ export interface TokenMap {
   EndTag: EndTag;
   Chars: Chars;
   Comment: Comment;
+  Doctype: Doctype;
 }
 
 export interface TokenizerDelegate {
   reset(): void;
   finishData(): void;
   tagOpen(): void;
 
+  beginDoctype(): void;
+  appendToDoctypeName(char: string): void;
+  appendToDoctypePublicIdentifier(char: string): void;
+  appendToDoctypeSystemIdentifier(char: string): void;
+  endDoctype(): void;
+
   beginData(): void;
   appendToData(char: string): void;
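Because `Token` is a discriminated union on `type`, the new variant narrows without casts. A small sketch, again assuming `tokenize`, `Token`, `Doctype`, and `TokenType` are re-exported from the package entry point:

```ts
import { tokenize, Token, Doctype, TokenType } from 'simple-html-tokenizer';

// Find the doctype (if any) in a token stream; comparing `token.type` against
// TokenType.Doctype narrows `token` to the Doctype interface.
function findDoctype(tokens: Token[]): Doctype | undefined {
  for (let token of tokens) {
    if (token.type === TokenType.Doctype) {
      return token; // typed as Doctype here, so publicIdentifier/systemIdentifier are available
    }
  }
  return undefined;
}

let doctype = findDoctype(tokenize('<!DOCTYPE html><html></html>'));
console.log(doctype && doctype.name); // 'html'
```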

tests/tokenizer-tests.ts

Lines changed: 57 additions & 0 deletions
@@ -1,5 +1,6 @@
 import {
   tokenize,
+  Doctype,
   StartTag,
   EndTag,
   Comment,
@@ -11,6 +12,26 @@ import {
 
 QUnit.module('simple-html-tokenizer - tokenizer');
 
+QUnit.test('Doctype', function(assert) {
+  let tokens = tokenize('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">');
+  assert.deepEqual(tokens, [ doctype('-//W3C//DTD HTML 4.01//EN', 'http://www.w3.org/TR/html4/strict.dtd') ], 'Standard HTML 4.01 Strict doctype');
+
+  tokens = tokenize('<!DOCTYPE html><html><body></body></html>');
+  assert.deepEqual(tokens, [
+    doctype(),
+    startTag('html'),
+    startTag('body'),
+    endTag('body'),
+    endTag('html'),
+  ], 'DOCTYPE is included in tokens');
+
+  tokens = tokenize('<!-- comment --><!DOCTYPE html>');
+  assert.deepEqual(tokens, [comment(' comment '), doctype()], 'DOCTYPE after comments is valid');
+
+  tokens = tokenize('<!-- comment --><!DOCTYPE html PUBLIC >');
+  assert.deepEqual(tokens, [comment(' comment '), doctype()], 'DOCTYPE after comments is valid');
+});
+
 QUnit.test('Simple content', function(assert) {
   let tokens = tokenize('hello');
   assert.deepEqual(tokens, [chars('hello')]);
@@ -289,6 +310,25 @@ QUnit.test('An Emberish named arg invocation', function(assert) {
   assert.deepEqual(tokens, [startTag('@foo'), endTag('@foo')]);
 });
 
+QUnit.test('Parsing <script>s out of a complex HTML document [stefanpenner/find-scripts-srcs-in-document#1]', function(assert) {
+  let input = `<!DOCTYPE html><html><head><script src="/foo.js"></script><script src="/bar.js"></script><script src="/baz.js"></script></head></html>`;
+
+  let tokens = tokenize(input);
+  assert.deepEqual(tokens, [
+    doctype(),
+    startTag('html'),
+    startTag('head'),
+    startTag('script', [['src', '/foo.js', true]]),
+    endTag('script'),
+    startTag('script', [['src', '/bar.js', true]]),
+    endTag('script'),
+    startTag('script', [['src', '/baz.js', true]]),
+    endTag('script'),
+    endTag('head'),
+    endTag('html'),
+  ]);
+});
+
 QUnit.module('simple-html-tokenizer - preprocessing');
 
 QUnit.test('Carriage returns are replaced with line feeds', function(assert) {
@@ -392,6 +432,23 @@ function endTag(tagName: string): EndTag {
   };
 }
 
+function doctype(publicIdentifier?: string, systemIdentifier?: string): Doctype {
+  let doctype: Doctype = {
+    type: TokenType.Doctype,
+    name: 'html',
+  };
+
+  if (publicIdentifier) {
+    doctype.publicIdentifier = publicIdentifier;
+  }
+
+  if (systemIdentifier) {
+    doctype.systemIdentifier = systemIdentifier;
+  }
+
+  return doctype;
+}
+
 function locInfo(
   token: Token,
   startLine: number,
