21
21
* DEALINGS IN THE SOFTWARE.
22
22
*/
23
23
24
+ /*
25
+ * The comments following this one that use the same comment syntax as this
26
+ * comment are quotes from the HTML Standard at https://html.spec.whatwg.org/
27
+ * as of 10 September 2020. That document came with this statement:
28
+ * Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). This work is
29
+ * licensed under a Creative Commons Attribution 4.0 International License.
30
+ */
31
+
24
32
package nu .validator .htmlparser .io ;
25
33
26
34
import java .io .IOException ;
@@ -197,9 +205,8 @@ public void tokenize(InputSource is) throws SAXException, IOException {
197
205
tokenizer .getErrorHandler (), tokenizer , this , heuristics );
198
206
} else {
199
207
if (this .characterEncoding != Encoding .UTF8 ) {
200
- errorWithoutLocation ("Legacy encoding \u201C "
201
- + this .characterEncoding .getCanonName ()
202
- + "\u201D used. Documents must use UTF-8." );
208
+ errorWithoutLocation (Encoding .msgLegacyEncoding (
209
+ this .characterEncoding .getCanonName ()));
203
210
}
204
211
becomeConfident ();
205
212
this .reader = new HtmlInputStreamReader (inputStream ,
@@ -333,50 +340,92 @@ public void setEncoding(Encoding encoding, Confidence confidence) {
333
340
}
334
341
}
335
342
343
+ private void errInternalActualDiffer (String internalCharset , String actual )
344
+ throws SAXException {
345
+ if (!internalCharset .equals (actual )) {
346
+ tokenizer .errTreeBuilder (
347
+ "Ignoring internal encoding declaration \u201C "
348
+ + internalCharset + "\u201D , which disagrees with"
349
+ + " the actual encoding of the document (\u201C "
350
+ + actual + "\u201D )." );
351
+ }
352
+ }
353
+
336
354
public boolean internalEncodingDeclaration (String internalCharset )
337
355
throws SAXException {
356
+ String actual = characterEncoding .getCanonName ();
357
+ if (confidence == Confidence .CERTAIN ) {
358
+ errInternalActualDiffer (internalCharset , actual );
359
+ return true ;
360
+ }
361
+ /* https://html.spec.whatwg.org/#changing-the-encoding-while-parsing */
338
362
try {
363
+ if ("utf-16be" .equals (actual ) || "utf-16le" .equals (actual )) {
364
+ errInternalActualDiffer (internalCharset , actual );
365
+ /*
366
+ * 1. If the encoding that is already being used to interpret
367
+ * the input stream is a UTF-16 encoding, then set the
368
+ * confidence to certain and return. The new encoding is ignored
369
+ * becomeConfident();
370
+ */
371
+ return true ;
372
+ }
339
373
internalCharset = internalCharset .toLowerCase ();
340
374
Encoding cs = Encoding .forName (internalCharset );
341
375
if ("utf-16be" .equals (internalCharset )
342
376
|| "utf-16le" .equals (internalCharset )) {
343
- tokenizer .errTreeBuilder ("Internal encoding declaration specified \u201C "
344
- + internalCharset
345
- + "\u201D which is not an ASCII superset. Continuing as if the encoding had been \u201C utf-8\u201D ." );
377
+ /*
378
+ * 2. If the new encoding is a UTF-16 encoding, then change it
379
+ * to UTF-8.
380
+ */
381
+ tokenizer .errTreeBuilder (
382
+ Encoding .msgIgnoredCharset (internalCharset , "utf-8" ));
346
383
cs = Encoding .UTF8 ;
347
384
internalCharset = "utf-8" ;
348
- } else {
349
- cs = Encoding .forName (internalCharset );
350
- }
351
- Encoding actual = cs .getActualHtmlEncoding ();
352
- if (actual == null ) {
353
- actual = cs ;
385
+ } else if ("x-user-defined" .equals (internalCharset )) {
386
+ /*
387
+ * 3. If the new encoding is x-user-defined, then change it to
388
+ * windows-1252.
389
+ */
390
+ tokenizer .errTreeBuilder (Encoding .msgIgnoredCharset (
391
+ "x-user-defined" , "windows-1252" ));
392
+ cs = Encoding .WINDOWS1252 ;
393
+ internalCharset = "windows-1252" ;
354
394
}
355
395
if (characterEncoding == null ) {
356
396
// Reader case
357
397
return true ;
358
398
}
359
- if (characterEncoding == actual ) {
399
+ if (characterEncoding == cs ) {
400
+ /*
401
+ * 4. If the new encoding is identical or equivalent to the
402
+ * encoding that is already being used to interpret the input
403
+ * stream, then set the confidence to certain and return.
404
+ */
360
405
becomeConfident ();
361
406
return true ;
362
407
}
363
- if (confidence == Confidence .CERTAIN && actual != characterEncoding ) {
364
- tokenizer .errTreeBuilder ("Internal encoding declaration \u201C "
365
- + internalCharset
366
- + "\u201D disagrees with the actual encoding of the document (\u201C "
367
- + characterEncoding .getCanonName () + "\u201D )." );
368
- } else {
369
- Encoding newEnc = whineAboutEncodingAndReturnCanonical (
370
- internalCharset , cs );
371
- tokenizer .errTreeBuilder ("Changing character encoding \u201C "
372
- + internalCharset + "\u201D and reparsing." );
373
- characterEncoding = newEnc ;
374
- throw new ReparseException ();
375
- }
376
- return true ;
408
+ /*
409
+ * 6. Otherwise, navigate to the document again, with
410
+ * historyHandling set to "replace", and using the same source
411
+ * browsing context, but this time skip the encoding sniffing
412
+ * algorithm and instead just set the encoding to the new encoding
413
+ */
414
+ Encoding newEnc = whineAboutEncodingAndReturnCanonical (
415
+ internalCharset , cs );
416
+ tokenizer .errTreeBuilder ("Changing character encoding to \u201C "
417
+ + internalCharset + "\u201D and reparsing." );
418
+ characterEncoding = newEnc ;
419
+ // Note: We intentionally don’t call becomeConfident() at this
420
+ // point. If we did, it would end up causing the exception
421
+ // java.lang.IllegalStateException: rewind() after willNotRewind()
422
+ // to be thrown later. So we are departing here from strictly
423
+ // following the ordering in the corresponding spec language, which
424
+ // specifies setting the confidence to "certain" at this point.
425
+ throw new ReparseException ();
377
426
} catch (UnsupportedCharsetException e ) {
378
- tokenizer .errTreeBuilder ("Internal encoding declaration named an unsupported chararacter encoding \u201C "
379
- + internalCharset + " \u201D ." );
427
+ tokenizer .errTreeBuilder (
428
+ Encoding . msgBadInternalCharset ( internalCharset ) );
380
429
return false ;
381
430
}
382
431
}
@@ -436,8 +485,7 @@ protected Encoding encodingFromExternalDeclaration(String encoding)
436
485
}
437
486
return whineAboutEncodingAndReturnCanonical (encoding , cs );
438
487
} catch (UnsupportedCharsetException e ) {
439
- tokenizer .err ("Unsupported character encoding name: \u201C " + encoding
440
- + "\u201D . Will sniff." );
488
+ tokenizer .err (Encoding .msgBadEncoding (encoding ) + " Will sniff." );
441
489
swallowBom = true ;
442
490
}
443
491
return null ; // keep the compiler happy
@@ -453,7 +501,7 @@ protected Encoding whineAboutEncodingAndReturnCanonical(String encoding,
453
501
Encoding cs ) throws SAXException {
454
502
String canonName = cs .getCanonName ();
455
503
if (!canonName .equals (encoding )) {
456
- tokenizer .err (Encoding .msgNotPreferredName (encoding , canonName ));
504
+ tokenizer .err (Encoding .msgNotCanonicalName (encoding , canonName ));
457
505
}
458
506
return cs ;
459
507
}
0 commit comments