21
21
* DEALINGS IN THE SOFTWARE.
22
22
*/
23
23
24
+ /*
25
+ * The comments following this one that use the same comment syntax as this
26
+ * comment are quotes from the HTML Standard at https://html.spec.whatwg.org/
27
+ * as of 10 September 2020. That document came with this statement:
28
+ * Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). This work is
29
+ * licensed under a Creative Commons Attribution 4.0 International License.
30
+ */
31
+
24
32
package nu .validator .htmlparser .io ;
25
33
26
34
import java .io .IOException ;
@@ -214,9 +222,8 @@ public void tokenize(InputSource is, int bufferSize)
214
222
tokenizer .getErrorHandler (), tokenizer , this , heuristics );
215
223
} else {
216
224
if (this .characterEncoding != Encoding .UTF8 ) {
217
- errorWithoutLocation ("Legacy encoding \u201C "
218
- + this .characterEncoding .getCanonName ()
219
- + "\u201D used. Documents must use UTF-8." );
225
+ errorWithoutLocation (Encoding .msgLegacyEncoding (
226
+ this .characterEncoding .getCanonName ()));
220
227
}
221
228
becomeConfident ();
222
229
this .reader = new HtmlInputStreamReader (inputStream ,
@@ -350,50 +357,92 @@ public void setEncoding(Encoding encoding, Confidence confidence) {
350
357
}
351
358
}
352
359
360
+ private void errInternalActualDiffer (String internalCharset , String actual )
361
+ throws SAXException {
362
+ if (!internalCharset .equals (actual )) {
363
+ tokenizer .errTreeBuilder (
364
+ "Ignoring internal encoding declaration \u201C "
365
+ + internalCharset + "\u201D , which disagrees with"
366
+ + " the actual encoding of the document (\u201C "
367
+ + actual + "\u201D )." );
368
+ }
369
+ }
370
+
353
371
public boolean internalEncodingDeclaration (String internalCharset )
354
372
throws SAXException {
373
+ String actual = characterEncoding .getCanonName ();
374
+ if (confidence == Confidence .CERTAIN ) {
375
+ errInternalActualDiffer (internalCharset , actual );
376
+ return true ;
377
+ }
378
+ /* https://html.spec.whatwg.org/#changing-the-encoding-while-parsing */
355
379
try {
380
+ if ("utf-16be" .equals (actual ) || "utf-16le" .equals (actual )) {
381
+ errInternalActualDiffer (internalCharset , actual );
382
+ /*
383
+ * 1. If the encoding that is already being used to interpret
384
+ * the input stream is a UTF-16 encoding, then set the
385
+ * confidence to certain and return. The new encoding is ignored.
386
+ * NOTE(review): becomeConfident(); sits inside this comment, so it is never executed here; confirm that is intended.
387
+ */
388
+ return true ;
389
+ }
356
390
internalCharset = internalCharset .toLowerCase ();
357
391
Encoding cs = Encoding .forName (internalCharset );
358
392
if ("utf-16be" .equals (internalCharset )
359
393
|| "utf-16le" .equals (internalCharset )) {
360
- tokenizer .errTreeBuilder ("Internal encoding declaration specified \u201C "
361
- + internalCharset
362
- + "\u201D which is not an ASCII superset. Continuing as if the encoding had been \u201C utf-8\u201D ." );
394
+ /*
395
+ * 2. If the new encoding is a UTF-16 encoding, then change it
396
+ * to UTF-8.
397
+ */
398
+ tokenizer .errTreeBuilder (
399
+ Encoding .msgIgnoredCharset (internalCharset , "utf-8" ));
363
400
cs = Encoding .UTF8 ;
364
401
internalCharset = "utf-8" ;
365
- } else {
366
- cs = Encoding .forName (internalCharset );
367
- }
368
- Encoding actual = cs .getActualHtmlEncoding ();
369
- if (actual == null ) {
370
- actual = cs ;
402
+ } else if ("x-user-defined" .equals (internalCharset )) {
403
+ /*
404
+ * 3. If the new encoding is x-user-defined, then change it to
405
+ * windows-1252.
406
+ */
407
+ tokenizer .errTreeBuilder (Encoding .msgIgnoredCharset (
408
+ "x-user-defined" , "windows-1252" ));
409
+ cs = Encoding .WINDOWS1252 ;
410
+ internalCharset = "windows-1252" ;
371
411
}
372
412
if (characterEncoding == null ) {
373
413
// Reader case
374
414
return true ;
375
415
}
376
- if (characterEncoding == actual ) {
416
+ if (characterEncoding == cs ) {
417
+ /*
418
+ * 4. If the new encoding is identical or equivalent to the
419
+ * encoding that is already being used to interpret the input
420
+ * stream, then set the confidence to certain and return.
421
+ */
377
422
becomeConfident ();
378
423
return true ;
379
424
}
380
- if (confidence == Confidence .CERTAIN && actual != characterEncoding ) {
381
- tokenizer .errTreeBuilder ("Internal encoding declaration \u201C "
382
- + internalCharset
383
- + "\u201D disagrees with the actual encoding of the document (\u201C "
384
- + characterEncoding .getCanonName () + "\u201D )." );
385
- } else {
386
- Encoding newEnc = whineAboutEncodingAndReturnCanonical (
387
- internalCharset , cs );
388
- tokenizer .errTreeBuilder ("Changing character encoding \u201C "
389
- + internalCharset + "\u201D and reparsing." );
390
- characterEncoding = newEnc ;
391
- throw new ReparseException ();
392
- }
393
- return true ;
425
+ /*
426
+ * 6. Otherwise, navigate to the document again, with
427
+ * historyHandling set to "replace", and using the same source
428
+ * browsing context, but this time skip the encoding sniffing
429
+ * algorithm and instead just set the encoding to the new encoding
430
+ */
431
+ Encoding newEnc = whineAboutEncodingAndReturnCanonical (
432
+ internalCharset , cs );
433
+ tokenizer .errTreeBuilder ("Changing character encoding to \u201C "
434
+ + internalCharset + "\u201D and reparsing." );
435
+ characterEncoding = newEnc ;
436
+ // Note: We intentionally don’t call becomeConfident() at this
437
+ // point. If we did, it would end up causing the exception
438
+ // java.lang.IllegalStateException: rewind() after willNotRewind()
439
+ // to be thrown later. So we are departing here from strictly
440
+ // following the ordering in the corresponding spec language, which
441
+ // specifies setting the confidence to "certain" at this point.
442
+ throw new ReparseException ();
394
443
} catch (UnsupportedCharsetException e ) {
395
- tokenizer .errTreeBuilder ("Internal encoding declaration named an unsupported chararacter encoding \u201C "
396
- + internalCharset + " \u201D ." );
444
+ tokenizer .errTreeBuilder (
445
+ Encoding . msgBadInternalCharset ( internalCharset ) );
397
446
return false ;
398
447
}
399
448
}
@@ -453,8 +502,7 @@ protected Encoding encodingFromExternalDeclaration(String encoding)
453
502
}
454
503
return whineAboutEncodingAndReturnCanonical (encoding , cs );
455
504
} catch (UnsupportedCharsetException e ) {
456
- tokenizer .err ("Unsupported character encoding name: \u201C " + encoding
457
- + "\u201D . Will sniff." );
505
+ tokenizer .err (Encoding .msgBadEncoding (encoding ) + " Will sniff." );
458
506
swallowBom = true ;
459
507
}
460
508
return null ; // keep the compiler happy
@@ -470,7 +518,7 @@ protected Encoding whineAboutEncodingAndReturnCanonical(String encoding,
470
518
Encoding cs ) throws SAXException {
471
519
String canonName = cs .getCanonName ();
472
520
if (!canonName .equals (encoding )) {
473
- tokenizer .err (Encoding .msgNotPreferredName (encoding , canonName ));
521
+ tokenizer .err (Encoding .msgNotCanonicalName (encoding , canonName ));
474
522
}
475
523
return cs ;
476
524
}
0 commit comments