Skip to content

Commit f9b3796

Browse files
Sync with the “Changing the encoding while parsing” algorithm
This change brings the parser into conformance with current requirements in the “Changing the encoding while parsing” algorithm in the HTML spec.
1 parent ceb9fe3 commit f9b3796

File tree

1 file changed

+80
-32
lines changed

1 file changed

+80
-32
lines changed

src/nu/validator/htmlparser/io/Driver.java

Lines changed: 80 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,14 @@
2121
* DEALINGS IN THE SOFTWARE.
2222
*/
2323

24+
/*
25+
* The comments following this one that use the same comment syntax as this
26+
* comment are quotes from the HTML Standard at https://html.spec.whatwg.org/
27+
* as of 10 September 2020. That document came with this statement:
28+
* Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). This work is
29+
* licensed under a Creative Commons Attribution 4.0 International License.
30+
*/
31+
2432
package nu.validator.htmlparser.io;
2533

2634
import java.io.IOException;
@@ -197,9 +205,8 @@ public void tokenize(InputSource is) throws SAXException, IOException {
197205
tokenizer.getErrorHandler(), tokenizer, this, heuristics);
198206
} else {
199207
if (this.characterEncoding != Encoding.UTF8) {
200-
errorWithoutLocation("Legacy encoding \u201C"
201-
+ this.characterEncoding.getCanonName()
202-
+ "\u201D used. Documents must use UTF-8.");
208+
errorWithoutLocation(Encoding.msgLegacyEncoding(
209+
this.characterEncoding.getCanonName()));
203210
}
204211
becomeConfident();
205212
this.reader = new HtmlInputStreamReader(inputStream,
@@ -333,50 +340,92 @@ public void setEncoding(Encoding encoding, Confidence confidence) {
333340
}
334341
}
335342

343+
private void errInternalActualDiffer(String internalCharset, String actual)
344+
throws SAXException {
345+
if (!internalCharset.equals(actual)) {
346+
tokenizer.errTreeBuilder(
347+
"Ignoring internal encoding declaration \u201C"
348+
+ internalCharset + "\u201D, which disagrees with"
349+
+ " the actual encoding of the document (\u201C"
350+
+ actual + "\u201D).");
351+
}
352+
}
353+
336354
public boolean internalEncodingDeclaration(String internalCharset)
337355
throws SAXException {
356+
String actual = characterEncoding.getCanonName();
357+
if (confidence == Confidence.CERTAIN) {
358+
errInternalActualDiffer(internalCharset, actual);
359+
return true;
360+
}
361+
/* https://html.spec.whatwg.org/#changing-the-encoding-while-parsing */
338362
try {
363+
if ("utf-16be".equals(actual) || "utf-16le".equals(actual)) {
364+
errInternalActualDiffer(internalCharset, actual);
365+
/*
366+
* 1. If the encoding that is already being used to interpret
367+
* the input stream is a UTF-16 encoding, then set the
368+
* confidence to certain and return. The new encoding is ignored.
369+
* becomeConfident();
370+
*/
371+
return true;
372+
}
339373
internalCharset = internalCharset.toLowerCase();
340374
Encoding cs = Encoding.forName(internalCharset);
341375
if ("utf-16be".equals(internalCharset)
342376
|| "utf-16le".equals(internalCharset)) {
343-
tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
344-
+ internalCharset
345-
+ "\u201D which is not an ASCII superset. Continuing as if the encoding had been \u201Cutf-8\u201D.");
377+
/*
378+
* 2. If the new encoding is a UTF-16 encoding, then change it
379+
* to UTF-8.
380+
*/
381+
tokenizer.errTreeBuilder(
382+
Encoding.msgIgnoredCharset(internalCharset, "utf-8"));
346383
cs = Encoding.UTF8;
347384
internalCharset = "utf-8";
348-
} else {
349-
cs = Encoding.forName(internalCharset);
350-
}
351-
Encoding actual = cs.getActualHtmlEncoding();
352-
if (actual == null) {
353-
actual = cs;
385+
} else if ("x-user-defined".equals(internalCharset)) {
386+
/*
387+
* 3. If the new encoding is x-user-defined, then change it to
388+
* windows-1252.
389+
*/
390+
tokenizer.errTreeBuilder(Encoding.msgIgnoredCharset(
391+
"x-user-defined", "windows-1252"));
392+
cs = Encoding.WINDOWS1252;
393+
internalCharset = "windows-1252";
354394
}
355395
if (characterEncoding == null) {
356396
// Reader case
357397
return true;
358398
}
359-
if (characterEncoding == actual) {
399+
if (characterEncoding == cs) {
400+
/*
401+
* 4. If the new encoding is identical or equivalent to the
402+
* encoding that is already being used to interpret the input
403+
* stream, then set the confidence to certain and return.
404+
*/
360405
becomeConfident();
361406
return true;
362407
}
363-
if (confidence == Confidence.CERTAIN && actual != characterEncoding) {
364-
tokenizer.errTreeBuilder("Internal encoding declaration \u201C"
365-
+ internalCharset
366-
+ "\u201D disagrees with the actual encoding of the document (\u201C"
367-
+ characterEncoding.getCanonName() + "\u201D).");
368-
} else {
369-
Encoding newEnc = whineAboutEncodingAndReturnCanonical(
370-
internalCharset, cs);
371-
tokenizer.errTreeBuilder("Changing character encoding \u201C"
372-
+ internalCharset + "\u201D and reparsing.");
373-
characterEncoding = newEnc;
374-
throw new ReparseException();
375-
}
376-
return true;
408+
/*
409+
* 6. Otherwise, navigate to the document again, with
410+
* historyHandling set to "replace", and using the same source
411+
* browsing context, but this time skip the encoding sniffing
412+
* algorithm and instead just set the encoding to the new encoding
413+
*/
414+
Encoding newEnc = whineAboutEncodingAndReturnCanonical(
415+
internalCharset, cs);
416+
tokenizer.errTreeBuilder("Changing character encoding to \u201C"
417+
+ internalCharset + "\u201D and reparsing.");
418+
characterEncoding = newEnc;
419+
// Note: We intentionally don’t call becomeConfident() at this
420+
// point. If we did, it would end up causing the exception
421+
// java.lang.IllegalStateException: rewind() after willNotRewind()
422+
// to be thrown later. So we are departing here from strictly
423+
// following the ordering in the corresponding spec language, which
424+
// specifies setting the confidence to "certain" at this point.
425+
throw new ReparseException();
377426
} catch (UnsupportedCharsetException e) {
378-
tokenizer.errTreeBuilder("Internal encoding declaration named an unsupported chararacter encoding \u201C"
379-
+ internalCharset + "\u201D.");
427+
tokenizer.errTreeBuilder(
428+
Encoding.msgBadInternalCharset(internalCharset));
380429
return false;
381430
}
382431
}
@@ -436,8 +485,7 @@ protected Encoding encodingFromExternalDeclaration(String encoding)
436485
}
437486
return whineAboutEncodingAndReturnCanonical(encoding, cs);
438487
} catch (UnsupportedCharsetException e) {
439-
tokenizer.err("Unsupported character encoding name: \u201C" + encoding
440-
+ "\u201D. Will sniff.");
488+
tokenizer.err(Encoding.msgBadEncoding(encoding) + " Will sniff.");
441489
swallowBom = true;
442490
}
443491
return null; // keep the compiler happy
@@ -453,7 +501,7 @@ protected Encoding whineAboutEncodingAndReturnCanonical(String encoding,
453501
Encoding cs) throws SAXException {
454502
String canonName = cs.getCanonName();
455503
if (!canonName.equals(encoding)) {
456-
tokenizer.err(Encoding.msgNotPreferredName(encoding, canonName));
504+
tokenizer.err(Encoding.msgNotCanonicalName(encoding, canonName));
457505
}
458506
return cs;
459507
}

0 commit comments

Comments
 (0)