Skip to content

Commit c22c3fa

Browse files
Make consecutive hyphens in comments a non-error
Also allow `<!-->` at (IE conditional) comment end. See whatwg/html#1356 and whatwg/html#1456.
1 parent 3f48926 commit c22c3fa

File tree

2 files changed

+192
-19
lines changed

2 files changed

+192
-19
lines changed

src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2009-2013 Mozilla Foundation
2+
* Copyright (c) 2009-2017 Mozilla Foundation
33
*
44
* Permission is hereby granted, free of charge, to any person obtaining a
55
* copy of this software and associated documentation files (the "Software"),
@@ -395,8 +395,8 @@ private boolean isAstralPrivateUse(int c) {
395395
err("Nameless doctype.");
396396
}
397397

398-
@Override protected void errConsecutiveHyphens() throws SAXException {
399-
err("Consecutive hyphens did not terminate a comment. \u201C--\u201D is not permitted inside a comment, but e.g. \u201C- -\u201D is.");
398+
@Override protected void errNestedComment() throws SAXException {
399+
err("Saw \u201C<!--\u201D within a comment. Probable cause: Nested comment (not allowed).");
400400
}
401401

402402
@Override protected void errPrematureEndOfComment() throws SAXException {
@@ -678,10 +678,6 @@ private boolean isAstralPrivateUse(int c) {
678678
err("Missing space before doctype name.");
679679
}
680680

681-
@Override protected void errHyphenHyphenBang() throws SAXException {
682-
err("\u201C--!\u201D found in comment.");
683-
}
684-
685681
@Override protected void errNcrControlChar() throws SAXException {
686682
err("Character reference expands to a control character ("
687683
+ toUPlusString((char) value) + ").");

src/nu/validator/htmlparser/impl/Tokenizer.java

Lines changed: 189 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,16 @@ public class Tokenizer implements Locator, Locator2 {
221221

222222
public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
223223

224+
public static final int AMBIGUOUS_AMPERSAND = 75;
225+
226+
public static final int COMMENT_LESSTHAN = 76;
227+
228+
public static final int COMMENT_LESSTHAN_BANG = 77;
229+
230+
public static final int COMMENT_LESSTHAN_BANG_DASH = 78;
231+
232+
public static final int COMMENT_LESSTHAN_BANG_DASH_DASH = 79;
233+
224234
/**
225235
* Magic value for UTF-16 operations.
226236
*/
@@ -1029,9 +1039,8 @@ private void maybeAppendSpaceToBogusComment() throws SAXException {
10291039

10301040
// ]NOCPP]
10311041

1032-
@Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c)
1042+
@Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c, boolean reportedConsecutiveHyphens)
10331043
throws SAXException {
1034-
errConsecutiveHyphens();
10351044
// [NOCPP[
10361045
switch (commentPolicy) {
10371046
case ALTER_INFOSET:
@@ -1042,7 +1051,9 @@ private void maybeAppendSpaceToBogusComment() throws SAXException {
10421051
appendStrBuf('-');
10431052
// CPPONLY: MOZ_FALLTHROUGH;
10441053
case ALLOW:
1045-
warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
1054+
if (!reportedConsecutiveHyphens) {
1055+
warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
1056+
}
10461057
// ]NOCPP]
10471058
appendStrBuf(c);
10481059
// [NOCPP[
@@ -1464,6 +1475,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
14641475
@SuppressWarnings("unused") private int stateLoop(int state, char c,
14651476
int pos, @NoLength char[] buf, boolean reconsume, int returnState,
14661477
int endPos) throws SAXException {
1478+
boolean reportedConsecutiveHyphens = false;
14671479
/*
14681480
* Idioms used in this code:
14691481
*
@@ -2540,6 +2552,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
25402552
}
25412553
// CPPONLY: MOZ_FALLTHROUGH;
25422554
case COMMENT_START:
2555+
reportedConsecutiveHyphens = false;
25432556
commentstartloop: for (;;) {
25442557
if (++pos == endPos) {
25452558
break stateloop;
@@ -2572,6 +2585,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
25722585
*/
25732586
state = transition(state, Tokenizer.DATA, reconsume, pos);
25742587
continue stateloop;
2588+
case '<':
2589+
appendStrBuf(c);
2590+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2591+
continue stateloop;
25752592
case '\r':
25762593
appendStrBufCarriageReturn();
25772594
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
@@ -2617,6 +2634,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
26172634
state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
26182635
break commentloop;
26192636
// continue stateloop;
2637+
case '<':
2638+
appendStrBuf(c);
2639+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2640+
continue stateloop;
26202641
case '\r':
26212642
appendStrBufCarriageReturn();
26222643
break stateloop;
@@ -2659,6 +2680,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
26592680
state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
26602681
break commentenddashloop;
26612682
// continue stateloop;
2683+
case '<':
2684+
appendStrBuf(c);
2685+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2686+
continue stateloop;
26622687
case '\r':
26632688
appendStrBufCarriageReturn();
26642689
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
@@ -2713,11 +2738,16 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
27132738
* Append a U+002D HYPHEN-MINUS (-) character to
27142739
* the comment token's data.
27152740
*/
2716-
adjustDoubleHyphenAndAppendToStrBufAndErr(c);
2741+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2742+
reportedConsecutiveHyphens = true;
27172743
/*
27182744
* Stay in the comment end state.
27192745
*/
27202746
continue;
2747+
case '<':
2748+
appendStrBuf(c);
2749+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2750+
continue stateloop;
27212751
case '\r':
27222752
adjustDoubleHyphenAndAppendToStrBufCarriageReturn();
27232753
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
@@ -2727,7 +2757,6 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
27272757
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
27282758
continue stateloop;
27292759
case '!':
2730-
errHyphenHyphenBang();
27312760
appendStrBuf(c);
27322761
state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
27332762
continue stateloop;
@@ -2740,7 +2769,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
27402769
* and the input character to the comment
27412770
* token's data.
27422771
*/
2743-
adjustDoubleHyphenAndAppendToStrBufAndErr(c);
2772+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2773+
reportedConsecutiveHyphens = true;
27442774
/*
27452775
* Switch to the comment state.
27462776
*/
@@ -2810,6 +2840,148 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
28102840
continue stateloop;
28112841
}
28122842
}
2843+
case COMMENT_LESSTHAN:
2844+
for (;;) {
2845+
if (++pos == endPos) {
2846+
break stateloop;
2847+
}
2848+
c = checkChar(buf, pos);
2849+
switch (c) {
2850+
case '!':
2851+
appendStrBuf(c);
2852+
state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG, reconsume, pos);
2853+
continue stateloop;
2854+
case '<':
2855+
appendStrBuf(c);
2856+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2857+
continue stateloop;
2858+
case '-':
2859+
appendStrBuf(c);
2860+
state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
2861+
continue stateloop;
2862+
case '\r':
2863+
appendStrBufCarriageReturn();
2864+
break stateloop;
2865+
case '\n':
2866+
appendStrBufLineFeed();
2867+
continue;
2868+
case '\u0000':
2869+
c = '\uFFFD';
2870+
// fall thru
2871+
default:
2872+
appendStrBuf(c);
2873+
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2874+
continue stateloop;
2875+
}
2876+
}
2877+
case COMMENT_LESSTHAN_BANG:
2878+
for (;;) {
2879+
if (++pos == endPos) {
2880+
break stateloop;
2881+
}
2882+
c = checkChar(buf, pos);
2883+
switch (c) {
2884+
case '-':
2885+
appendStrBuf(c);
2886+
state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG_DASH, reconsume, pos);
2887+
continue stateloop;
2888+
case '<':
2889+
appendStrBuf(c);
2890+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2891+
continue stateloop;
2892+
case '\r':
2893+
appendStrBufCarriageReturn();
2894+
break stateloop;
2895+
case '\n':
2896+
appendStrBufLineFeed();
2897+
continue;
2898+
case '\u0000':
2899+
c = '\uFFFD';
2900+
// fall thru
2901+
default:
2902+
appendStrBuf(c);
2903+
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2904+
continue stateloop;
2905+
}
2906+
}
2907+
case COMMENT_LESSTHAN_BANG_DASH:
2908+
for (;;) {
2909+
if (++pos == endPos) {
2910+
break stateloop;
2911+
}
2912+
c = checkChar(buf, pos);
2913+
switch (c) {
2914+
case '-':
2915+
appendStrBuf(c);
2916+
state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG_DASH_DASH, reconsume, pos);
2917+
continue stateloop;
2918+
case '<':
2919+
appendStrBuf(c);
2920+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2921+
continue stateloop;
2922+
case '\r':
2923+
appendStrBufCarriageReturn();
2924+
break stateloop;
2925+
case '\n':
2926+
appendStrBufLineFeed();
2927+
continue;
2928+
case '\u0000':
2929+
c = '\uFFFD';
2930+
// fall thru
2931+
default:
2932+
appendStrBuf(c);
2933+
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2934+
continue stateloop;
2935+
}
2936+
}
2937+
case COMMENT_LESSTHAN_BANG_DASH_DASH:
2938+
for (;;) {
2939+
if (++pos == endPos) {
2940+
break stateloop;
2941+
}
2942+
c = checkChar(buf, pos);
2943+
switch (c) {
2944+
case '>':
2945+
appendStrBuf(c);
2946+
emitComment(3, pos);
2947+
state = transition(state, Tokenizer.DATA, reconsume, pos);
2948+
continue stateloop;
2949+
case '-':
2950+
errNestedComment();
2951+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2952+
reportedConsecutiveHyphens = true;
2953+
state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2954+
continue stateloop;
2955+
case '\r':
2956+
errNestedComment();
2957+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2958+
reportedConsecutiveHyphens = true;
2959+
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2960+
break stateloop;
2961+
case '\n':
2962+
errNestedComment();
2963+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2964+
reportedConsecutiveHyphens = true;
2965+
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2966+
continue;
2967+
case '\u0000':
2968+
c = '\uFFFD';
2969+
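// fall thru -- NOTE(review): this falls into case '!' (transition to COMMENT_END_BANG), not into default as in the sibling COMMENT_LESSTHAN* states; confirm U+0000 is really meant to be handled like '!'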
2970+
case '!':
2971+
errNestedComment();
2972+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2973+
reportedConsecutiveHyphens = true;
2974+
state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
2975+
continue stateloop;
2976+
default:
2977+
errNestedComment();
2978+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2979+
reportedConsecutiveHyphens = true;
2980+
state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2981+
continue stateloop;
2982+
}
2983+
}
2984+
// XXX reorder point
28132985
case COMMENT_START_DASH:
28142986
if (++pos == endPos) {
28152987
break stateloop;
@@ -2838,6 +3010,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
28383010
*/
28393011
state = transition(state, Tokenizer.DATA, reconsume, pos);
28403012
continue stateloop;
3013+
case '<':
3014+
appendStrBuf(c);
3015+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
3016+
continue stateloop;
28413017
case '\r':
28423018
appendStrBufCarriageReturn();
28433019
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
@@ -5957,13 +6133,13 @@ private void initDoctypeFields() {
59576133
@Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn()
59586134
throws SAXException {
59596135
silentCarriageReturn();
5960-
adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
6136+
adjustDoubleHyphenAndAppendToStrBufAndErr('\n', false);
59616137
}
59626138

59636139
@Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed()
59646140
throws SAXException {
59656141
silentLineFeed();
5966-
adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
6142+
adjustDoubleHyphenAndAppendToStrBufAndErr('\n', false);
59676143
}
59686144

59696145
@Inline private void appendStrBufLineFeed() {
@@ -6268,6 +6444,8 @@ public void eof() throws SAXException {
62686444
break eofloop;
62696445
case COMMENT_START:
62706446
case COMMENT:
6447+
case COMMENT_LESSTHAN:
6448+
case COMMENT_LESSTHAN_BANG:
62716449
/*
62726450
* EOF Parse error.
62736451
*/
@@ -6279,6 +6457,7 @@ public void eof() throws SAXException {
62796457
*/
62806458
break eofloop;
62816459
case COMMENT_END:
6460+
case COMMENT_LESSTHAN_BANG_DASH_DASH:
62826461
errEofInComment();
62836462
/* Emit the comment token. */
62846463
emitComment(2, 0);
@@ -6288,6 +6467,7 @@ public void eof() throws SAXException {
62886467
break eofloop;
62896468
case COMMENT_END_DASH:
62906469
case COMMENT_START_DASH:
6470+
case COMMENT_LESSTHAN_BANG_DASH:
62916471
errEofInComment();
62926472
/* Emit the comment token. */
62936473
emitComment(1, 0);
@@ -6917,7 +7097,7 @@ protected void errGtInPublicId() throws SAXException {
69177097
protected void errNamelessDoctype() throws SAXException {
69187098
}
69197099

6920-
protected void errConsecutiveHyphens() throws SAXException {
7100+
protected void errNestedComment() throws SAXException {
69217101
}
69227102

69237103
protected void errPrematureEndOfComment() throws SAXException {
@@ -7060,9 +7240,6 @@ protected void errExpectedSystemId() throws SAXException {
70607240
protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
70617241
}
70627242

7063-
protected void errHyphenHyphenBang() throws SAXException {
7064-
}
7065-
70667243
protected void errNcrControlChar() throws SAXException {
70677244
}
70687245

0 commit comments

Comments
 (0)