Skip to content

Commit 0cfe6d2

Browse files
Sync with the “Changing the encoding while parsing” algorithm
This change brings the parser into conformance with current requirements in the “Changing the encoding while parsing” algorithm in the HTML spec.
1 parent edfaeff commit 0cfe6d2

File tree

1 file changed

+80
-32
lines changed

1 file changed

+80
-32
lines changed

src/nu/validator/htmlparser/io/Driver.java

Lines changed: 80 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,14 @@
2121
* DEALINGS IN THE SOFTWARE.
2222
*/
2323

24+
/*
25+
* The comments following this one that use the same comment syntax as this
26+
* comment are quotes from the HTML Standard at https://html.spec.whatwg.org/
27+
* as of 10 September 2020. That document came with this statement:
28+
* Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). This work is
29+
* licensed under a Creative Commons Attribution 4.0 International License.
30+
*/
31+
2432
package nu.validator.htmlparser.io;
2533

2634
import java.io.IOException;
@@ -214,9 +222,8 @@ public void tokenize(InputSource is, int bufferSize)
214222
tokenizer.getErrorHandler(), tokenizer, this, heuristics);
215223
} else {
216224
if (this.characterEncoding != Encoding.UTF8) {
217-
errorWithoutLocation("Legacy encoding \u201C"
218-
+ this.characterEncoding.getCanonName()
219-
+ "\u201D used. Documents must use UTF-8.");
225+
errorWithoutLocation(Encoding.msgLegacyEncoding(
226+
this.characterEncoding.getCanonName()));
220227
}
221228
becomeConfident();
222229
this.reader = new HtmlInputStreamReader(inputStream,
@@ -350,50 +357,92 @@ public void setEncoding(Encoding encoding, Confidence confidence) {
350357
}
351358
}
352359

360+
private void errInternalActualDiffer(String internalCharset, String actual)
361+
throws SAXException {
362+
if (!internalCharset.equals(actual)) {
363+
tokenizer.errTreeBuilder(
364+
"Ignoring internal encoding declaration \u201C"
365+
+ internalCharset + "\u201D, which disagrees with"
366+
+ " the actual encoding of the document (\u201C"
367+
+ actual + "\u201D).");
368+
}
369+
}
370+
353371
public boolean internalEncodingDeclaration(String internalCharset)
354372
throws SAXException {
373+
String actual = characterEncoding.getCanonName();
374+
if (confidence == Confidence.CERTAIN) {
375+
errInternalActualDiffer(internalCharset, actual);
376+
return true;
377+
}
378+
/* https://html.spec.whatwg.org/#changing-the-encoding-while-parsing */
355379
try {
380+
if ("utf-16be".equals(actual) || "utf-16le".equals(actual)) {
381+
errInternalActualDiffer(internalCharset, actual);
382+
/*
383+
* 1. If the encoding that is already being used to interpret
384+
* the input stream is a UTF-16 encoding, then set the
385+
* confidence to certain and return. The new encoding is ignored
386+
* becomeConfident();
387+
*/
388+
return true;
389+
}
356390
internalCharset = internalCharset.toLowerCase();
357391
Encoding cs = Encoding.forName(internalCharset);
358392
if ("utf-16be".equals(internalCharset)
359393
|| "utf-16le".equals(internalCharset)) {
360-
tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
361-
+ internalCharset
362-
+ "\u201D which is not an ASCII superset. Continuing as if the encoding had been \u201Cutf-8\u201D.");
394+
/*
395+
* 2. If the new encoding is a UTF-16 encoding, then change it
396+
* to UTF-8.
397+
*/
398+
tokenizer.errTreeBuilder(
399+
Encoding.msgIgnoredCharset(internalCharset, "utf-8"));
363400
cs = Encoding.UTF8;
364401
internalCharset = "utf-8";
365-
} else {
366-
cs = Encoding.forName(internalCharset);
367-
}
368-
Encoding actual = cs.getActualHtmlEncoding();
369-
if (actual == null) {
370-
actual = cs;
402+
} else if ("x-user-defined".equals(internalCharset)) {
403+
/*
404+
* 3. If the new encoding is x-user-defined, then change it to
405+
* windows-1252.
406+
*/
407+
tokenizer.errTreeBuilder(Encoding.msgIgnoredCharset(
408+
"x-user-defined", "windows-1252"));
409+
cs = Encoding.WINDOWS1252;
410+
internalCharset = "windows-1252";
371411
}
372412
if (characterEncoding == null) {
373413
// Reader case
374414
return true;
375415
}
376-
if (characterEncoding == actual) {
416+
if (characterEncoding == cs) {
417+
/*
418+
* 4. If the new encoding is identical or equivalent to the
419+
* encoding that is already being used to interpret the input
420+
* stream, then set the confidence to certain and return.
421+
*/
377422
becomeConfident();
378423
return true;
379424
}
380-
if (confidence == Confidence.CERTAIN && actual != characterEncoding) {
381-
tokenizer.errTreeBuilder("Internal encoding declaration \u201C"
382-
+ internalCharset
383-
+ "\u201D disagrees with the actual encoding of the document (\u201C"
384-
+ characterEncoding.getCanonName() + "\u201D).");
385-
} else {
386-
Encoding newEnc = whineAboutEncodingAndReturnCanonical(
387-
internalCharset, cs);
388-
tokenizer.errTreeBuilder("Changing character encoding \u201C"
389-
+ internalCharset + "\u201D and reparsing.");
390-
characterEncoding = newEnc;
391-
throw new ReparseException();
392-
}
393-
return true;
425+
/*
426+
* 6. Otherwise, navigate to the document again, with
427+
* historyHandling set to "replace", and using the same source
428+
* browsing context, but this time skip the encoding sniffing
429+
* algorithm and instead just set the encoding to the new encoding
430+
*/
431+
Encoding newEnc = whineAboutEncodingAndReturnCanonical(
432+
internalCharset, cs);
433+
tokenizer.errTreeBuilder("Changing character encoding to \u201C"
434+
+ internalCharset + "\u201D and reparsing.");
435+
characterEncoding = newEnc;
436+
// Note: We intentionally don’t call becomeConfident() at this
437+
// point. If we did, it would end up causing the exception
438+
// java.lang.IllegalStateException: rewind() after willNotRewind()
439+
// to be thrown later. So we are departing here from strictly
440+
// following the ordering in the corresponding spec language, which
441+
// specifies setting the confidence to "certain" at this point.
442+
throw new ReparseException();
394443
} catch (UnsupportedCharsetException e) {
395-
tokenizer.errTreeBuilder("Internal encoding declaration named an unsupported chararacter encoding \u201C"
396-
+ internalCharset + "\u201D.");
444+
tokenizer.errTreeBuilder(
445+
Encoding.msgBadInternalCharset(internalCharset));
397446
return false;
398447
}
399448
}
@@ -453,8 +502,7 @@ protected Encoding encodingFromExternalDeclaration(String encoding)
453502
}
454503
return whineAboutEncodingAndReturnCanonical(encoding, cs);
455504
} catch (UnsupportedCharsetException e) {
456-
tokenizer.err("Unsupported character encoding name: \u201C" + encoding
457-
+ "\u201D. Will sniff.");
505+
tokenizer.err(Encoding.msgBadEncoding(encoding) + " Will sniff.");
458506
swallowBom = true;
459507
}
460508
return null; // keep the compiler happy
@@ -470,7 +518,7 @@ protected Encoding whineAboutEncodingAndReturnCanonical(String encoding,
470518
Encoding cs) throws SAXException {
471519
String canonName = cs.getCanonName();
472520
if (!canonName.equals(encoding)) {
473-
tokenizer.err(Encoding.msgNotPreferredName(encoding, canonName));
521+
tokenizer.err(Encoding.msgNotCanonicalName(encoding, canonName));
474522
}
475523
return cs;
476524
}

0 commit comments

Comments
 (0)