From 208fd514f61a926b5a19bd4092a2e56af10b624a Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sat, 12 Sep 2020 20:39:25 +0900 Subject: [PATCH 01/15] =?UTF-8?q?Private=20encodingByCookedName=20?= =?UTF-8?q?=E2=86=92=20encodingByLabel?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change renames the private encodingByCookedName map in the Encoding class to encodingByLabel (for consistency with Encoding spec terminology). --- src/nu/validator/htmlparser/io/Encoding.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/nu/validator/htmlparser/io/Encoding.java b/src/nu/validator/htmlparser/io/Encoding.java index 123465f8..282c34e8 100644 --- a/src/nu/validator/htmlparser/io/Encoding.java +++ b/src/nu/validator/htmlparser/io/Encoding.java @@ -63,6 +63,8 @@ public class Encoding { "xjisautodetect", "xutf16bebom", "xutf16lebom", "xutf32bebom", "xutf32lebom", "xutf16oppositeendian", "xutf16platformendian", "xutf32oppositeendian", "xutf32platformendian" }; + private static Map encodingByLabel = + new HashMap(); /* From the table at https://encoding.spec.whatwg.org/#names-and-labels, * everything in the Labels column, sorted */ @@ -339,14 +341,14 @@ asciiSuperset, isObscure(name), encodings.add(enc); Set aliases = cs.aliases(); for (String alias : aliases) { - encodingByCookedName.put(toNameKey(alias).intern(), enc); + encodingByLabel.put(toNameKey(alias).intern(), enc); } } } // Overwrite possible overlapping aliases with the real things--just in // case for (Encoding encoding : encodings) { - encodingByCookedName.put(toNameKey(encoding.getCanonName()), + encodingByLabel.put(toNameKey(encoding.getCanonName()), encoding); } UTF8 = forName("utf-8"); @@ -383,15 +385,15 @@ asciiSuperset, isObscure(name), } catch (UnsupportedCharsetException e) { } try { - encodingByCookedName.put("x-x-big5", forName("big5")); + encodingByLabel.put("x-x-big5", forName("big5")); } catch 
(UnsupportedCharsetException e) { } try { - encodingByCookedName.put("euc-kr", forName("windows-949")); + encodingByLabel.put("euc-kr", forName("windows-949")); } catch (UnsupportedCharsetException e) { } try { - encodingByCookedName.put("ks_c_5601-1987", forName("windows-949")); + encodingByLabel.put("ks_c_5601-1987", forName("windows-949")); } catch (UnsupportedCharsetException e) { } } @@ -459,7 +461,7 @@ private static boolean isLikelyEbcdic(String canonName, } public static Encoding forName(String name) { - Encoding rv = encodingByCookedName.get(toNameKey(name)); + Encoding rv = encodingByLabel.get(toNameKey(name)); if (rv == null) { throw new UnsupportedCharsetException(name); } else { @@ -617,7 +619,7 @@ public Encoding getActualHtmlEncoding() { } public static void main(String[] args) { - for (Map.Entry entry : encodingByCookedName.entrySet()) { + for (Map.Entry entry : encodingByLabel.entrySet()) { String name = entry.getKey(); Encoding enc = entry.getValue(); System.out.printf( From eab36025dded2cc7a9dee75e32c003ce35cbf8b0 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sat, 12 Sep 2020 20:59:16 +0900 Subject: [PATCH 02/15] =?UTF-8?q?Drop/replace=20=E2=80=9Cis=20ASCII=20supe?= =?UTF-8?q?rset=E2=80=9D-checking=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change drops or replaces all code for checking whether a particular encoding is an ASCII superset — because the code no longer corresponds to actual requirements in the Encoding spec (which instead now requires checking only whether an encoding is utf-16be or utf-16le). 
--- .../htmlparser/extra/ChardetSniffer.java | 4 +- .../htmlparser/extra/IcuDetectorSniffer.java | 3 +- src/nu/validator/htmlparser/io/Driver.java | 6 -- src/nu/validator/htmlparser/io/Encoding.java | 58 ------------------- .../validator/htmlparser/io/MetaSniffer.java | 6 -- 5 files changed, 5 insertions(+), 72 deletions(-) diff --git a/src/nu/validator/htmlparser/extra/ChardetSniffer.java b/src/nu/validator/htmlparser/extra/ChardetSniffer.java index a7575039..f4eb3852 100644 --- a/src/nu/validator/htmlparser/extra/ChardetSniffer.java +++ b/src/nu/validator/htmlparser/extra/ChardetSniffer.java @@ -54,7 +54,9 @@ public Encoding sniff() throws IOException { detector.Init(this); detector.DoIt(source, length, false); detector.DataEnd(); - if (returnValue != null && returnValue != Encoding.WINDOWS1252 && returnValue.isAsciiSuperset()) { + if (returnValue != null && returnValue != Encoding.WINDOWS1252 + && returnValue != Encoding.UTF16BE + && returnValue != Encoding.UTF16LE) { return returnValue; } else { return null; diff --git a/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java b/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java index f3caab5c..9b8cfa0d 100644 --- a/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java +++ b/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java @@ -57,7 +57,8 @@ public Encoding sniff() throws IOException { if (actual != null) { enc = actual; } - if (enc != Encoding.WINDOWS1252 && enc.isAsciiSuperset()) { + if (enc != Encoding.WINDOWS1252 // + && enc != Encoding.UTF16BE && enc != Encoding.UTF16LE) { return enc; } else { return null; diff --git a/src/nu/validator/htmlparser/io/Driver.java b/src/nu/validator/htmlparser/io/Driver.java index a8dd387c..afde98df 100644 --- a/src/nu/validator/htmlparser/io/Driver.java +++ b/src/nu/validator/htmlparser/io/Driver.java @@ -370,12 +370,6 @@ public boolean internalEncodingDeclaration(String internalCharset) if (actual == null) { actual = cs; } - if 
(!actual.isAsciiSuperset()) { - tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C" - + internalCharset - + "\u201D which is not an ASCII superset. Not changing the encoding."); - return false; - } if (characterEncoding == null) { // Reader case return true; diff --git a/src/nu/validator/htmlparser/io/Encoding.java b/src/nu/validator/htmlparser/io/Encoding.java index 282c34e8..2e518fb8 100644 --- a/src/nu/validator/htmlparser/io/Encoding.java +++ b/src/nu/validator/htmlparser/io/Encoding.java @@ -304,8 +304,6 @@ public class Encoding { private final Charset charset; - private final boolean asciiSuperset; - private final boolean obscure; private final boolean shouldNot; @@ -315,15 +313,6 @@ public class Encoding { private Encoding actualHtmlEncoding = null; static { - byte[] testBuf = new byte[0x7F]; - for (int i = 0; i < 0x7F; i++) { - if (isAsciiSupersetnessSensitive(i)) { - testBuf[i] = (byte) i; - } else { - testBuf[i] = (byte) 0x20; - } - } - Set encodings = new HashSet(); SortedMap charsets = Charset.availableCharsets(); @@ -398,12 +387,6 @@ asciiSuperset, isObscure(name), } } - private static boolean isAsciiSupersetnessSensitive(int c) { - return (c >= 0x09 && c <= 0x0D) || (c >= 0x20 && c <= 0x22) - || (c >= 0x26 && c <= 0x27) || (c >= 0x2C && c <= 0x3F) - || (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A); - } - private static boolean isObscure(String lowerCasePreferredIanaName) { return !(Arrays.binarySearch(NOT_OBSCURE, lowerCasePreferredIanaName) > -1); } @@ -419,38 +402,6 @@ private static boolean isShouldNot(String lowerCasePreferredIanaName) { return (Arrays.binarySearch(SHOULD_NOT, lowerCasePreferredIanaName) > -1); } - /** - * @param testBuf - * @param cs - */ - private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) { - CharsetDecoder dec = cs.newDecoder(); - dec.onMalformedInput(CodingErrorAction.REPORT); - dec.onUnmappableCharacter(CodingErrorAction.REPORT); - Reader r = new InputStreamReader(new 
ByteArrayInputStream(testBuf), dec); - try { - for (int i = 0; i < 0x7F; i++) { - if (isAsciiSupersetnessSensitive(i)) { - if (r.read() != i) { - return false; - } - } else { - if (r.read() != 0x20) { - return false; - } - } - } - } catch (IOException e) { - return false; - } catch (Exception e) { - return false; - } catch (CoderMalfunctionError e) { - return false; - } - - return true; - } - private static boolean isLikelyEbcdic(String canonName, boolean asciiSuperset) { if (!asciiSuperset) { @@ -536,15 +487,6 @@ private Encoding(final String canonName, final Charset charset, this.likelyEbcdic = likelyEbcdic; } - /** - * Returns the asciiSuperset. - * - * @return the asciiSuperset - */ - public boolean isAsciiSuperset() { - return asciiSuperset; - } - /** * Returns the canonName. * diff --git a/src/nu/validator/htmlparser/io/MetaSniffer.java b/src/nu/validator/htmlparser/io/MetaSniffer.java index 9deaef7a..acbd893e 100755 --- a/src/nu/validator/htmlparser/io/MetaSniffer.java +++ b/src/nu/validator/htmlparser/io/MetaSniffer.java @@ -169,12 +169,6 @@ protected boolean tryCharset(String encoding) throws SAXException { } else { Encoding cs = Encoding.forName(encoding); String canonName = cs.getCanonName(); - if (!cs.isAsciiSuperset()) { - err("The encoding \u201C" - + encoding - + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. 
Continuing the sniffing algorithm."); - return false; - } if (!cs.isRegistered()) { if (encoding.startsWith("x-")) { err("The encoding \u201C" From 6413b441d228c3362bcd7714a431949fbc1abd7e Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sat, 12 Sep 2020 22:48:30 +0900 Subject: [PATCH 03/15] Add shared error-message strings to Encoding class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds some error-message strings to the Encoding class — for shared use by the htmlparser.io.Driver and htmlparser.io.MetaSniffer classes, which need to emit the same error messages in a number of cases. --- src/nu/validator/htmlparser/io/Encoding.java | 25 ++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/nu/validator/htmlparser/io/Encoding.java b/src/nu/validator/htmlparser/io/Encoding.java index 2e518fb8..65583903 100644 --- a/src/nu/validator/htmlparser/io/Encoding.java +++ b/src/nu/validator/htmlparser/io/Encoding.java @@ -560,6 +560,31 @@ public Encoding getActualHtmlEncoding() { return actualHtmlEncoding; } + protected static String msgLegacyEncoding(String name) { + return "Legacy encoding \u201C" + name + "\u201D used. Documents must" + " use UTF-8."; + } + + protected static String msgIgnoredCharset(String ignored, String name) { + return "Internal encoding declaration specified \u201C" + ignored + "\u201D. Continuing as if the encoding had been \u201C" + name + "\u201D."; + } + protected static String msgNotCanonicalName(String label, String name) { + return "The encoding \u201C" + label + "\u201D is not the canonical" + " name of the character encoding in use. The canonical name" + " is \u201C" + name + "\u201D. 
(Charmod C024)"; + } + + protected static String msgBadInternalCharset(String internalCharset) { + return "Internal encoding declaration named an unsupported character" + + " encoding \u201C" + internalCharset + "\u201D."; + } + + protected static String msgBadEncoding(String name) { + return "Unsupported character encoding name: \u201C" + name + "\u201D."; + } + public static void main(String[] args) { for (Map.Entry entry : encodingByLabel.entrySet()) { String name = entry.getKey(); From ff4258b73656a3c266180d4f08c41fdd7fb5fe7b Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sat, 12 Sep 2020 21:07:44 +0900 Subject: [PATCH 04/15] =?UTF-8?q?Drop=20=E2=80=9Cactual=20HTML=20encoding?= =?UTF-8?q?=E2=80=9D-related=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change removes all code related to checking and using the “actual HTML encoding”, which no longer corresponds to current spec requirements. --- .../htmlparser/extra/ChardetSniffer.java | 4 -- .../htmlparser/extra/IcuDetectorSniffer.java | 4 -- src/nu/validator/htmlparser/io/Driver.java | 11 ++-- src/nu/validator/htmlparser/io/Encoding.java | 51 ------------------- .../validator/htmlparser/io/MetaSniffer.java | 4 +- 5 files changed, 5 insertions(+), 69 deletions(-) diff --git a/src/nu/validator/htmlparser/extra/ChardetSniffer.java b/src/nu/validator/htmlparser/extra/ChardetSniffer.java index f4eb3852..4f6c5826 100644 --- a/src/nu/validator/htmlparser/extra/ChardetSniffer.java +++ b/src/nu/validator/htmlparser/extra/ChardetSniffer.java @@ -74,10 +74,6 @@ public static void main(String[] args) { public void Notify(String charsetName) { try { Encoding enc = Encoding.forName(charsetName); - Encoding actual = enc.getActualHtmlEncoding(); - if (actual != null) { - enc = actual; - } returnValue = enc; } catch (UnsupportedCharsetException e) { returnValue = null; diff --git a/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java 
b/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java index 9b8cfa0d..7aa0dde0 100644 --- a/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java +++ b/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java @@ -53,10 +53,6 @@ public Encoding sniff() throws IOException { detector.setText(this); CharsetMatch match = detector.detect(); Encoding enc = Encoding.forName(match.getName()); - Encoding actual = enc.getActualHtmlEncoding(); - if (actual != null) { - enc = actual; - } if (enc != Encoding.WINDOWS1252 // && enc != Encoding.UTF16BE && enc != Encoding.UTF16LE) { return enc; diff --git a/src/nu/validator/htmlparser/io/Driver.java b/src/nu/validator/htmlparser/io/Driver.java index afde98df..8bab457f 100644 --- a/src/nu/validator/htmlparser/io/Driver.java +++ b/src/nu/validator/htmlparser/io/Driver.java @@ -497,15 +497,10 @@ protected Encoding whineAboutEncodingAndReturnActual(String encoding, + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); } - Encoding actual = cs.getActualHtmlEncoding(); - if (actual == null) { - return cs; - } else { - tokenizer.warn("Using \u201C" + actual.getCanonName() - + "\u201D instead of the declared encoding \u201C" - + encoding + "\u201D."); - return actual; + if (!canonName.equals(encoding)) { + tokenizer.err(Encoding.msgNotPreferredName(encoding, canonName)); } + return cs; } private class ReparseException extends SAXException { diff --git a/src/nu/validator/htmlparser/io/Encoding.java b/src/nu/validator/htmlparser/io/Encoding.java index 65583903..e7109893 100644 --- a/src/nu/validator/htmlparser/io/Encoding.java +++ b/src/nu/validator/htmlparser/io/Encoding.java @@ -310,8 +310,6 @@ public class Encoding { private final boolean likelyEbcdic; - private Encoding actualHtmlEncoding = null; - static { Set encodings = new HashSet(); @@ -345,46 +343,6 @@ asciiSuperset, isObscure(name), UTF16BE = forName("utf-16be"); UTF16LE = forName("utf-16le"); WINDOWS1252 = 
forName("windows-1252"); - try { - forName("iso-8859-1").actualHtmlEncoding = forName("windows-1252"); - } catch (UnsupportedCharsetException e) { - } - try { - forName("iso-8859-9").actualHtmlEncoding = forName("windows-1254"); - } catch (UnsupportedCharsetException e) { - } - try { - forName("iso-8859-11").actualHtmlEncoding = forName("windows-874"); - } catch (UnsupportedCharsetException e) { - } - try { - forName("x-iso-8859-11").actualHtmlEncoding = forName("windows-874"); - } catch (UnsupportedCharsetException e) { - } - try { - forName("tis-620").actualHtmlEncoding = forName("windows-874"); - } catch (UnsupportedCharsetException e) { - } - try { - forName("gb_2312-80").actualHtmlEncoding = forName("gbk"); - } catch (UnsupportedCharsetException e) { - } - try { - forName("gb2312").actualHtmlEncoding = forName("gbk"); - } catch (UnsupportedCharsetException e) { - } - try { - encodingByLabel.put("x-x-big5", forName("big5")); - } catch (UnsupportedCharsetException e) { - } - try { - encodingByLabel.put("euc-kr", forName("windows-949")); - } catch (UnsupportedCharsetException e) { - } - try { - encodingByLabel.put("ks_c_5601-1987", forName("windows-949")); - } catch (UnsupportedCharsetException e) { - } } private static boolean isObscure(String lowerCasePreferredIanaName) { @@ -551,15 +509,6 @@ public CharsetEncoder newEncoder() { return charset.newEncoder(); } - /** - * Returns the actualHtmlEncoding. - * - * @return the actualHtmlEncoding - */ - public Encoding getActualHtmlEncoding() { - return actualHtmlEncoding; - } - protected static String msgLegacyEncoding(String name) { return "Legacy encoding \u201C" + name + "\u201D used. 
Documents must" + " use UTF-8."; diff --git a/src/nu/validator/htmlparser/io/MetaSniffer.java b/src/nu/validator/htmlparser/io/MetaSniffer.java index acbd893e..e8e7018e 100755 --- a/src/nu/validator/htmlparser/io/MetaSniffer.java +++ b/src/nu/validator/htmlparser/io/MetaSniffer.java @@ -191,8 +191,8 @@ protected boolean tryCharset(String encoding) throws SAXException { } else if (cs.isObscure()) { warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); } - Encoding actual = cs.getActualHtmlEncoding(); - if (actual == null) { + if (!cs.getCanonName().equals(encoding)) { + err(Encoding.msgNotCanonicalName(encoding, canonName)); this.characterEncoding = cs; } else { warn("Using \u201C" + actual.getCanonName() + "\u201D instead of the declared encoding \u201C" + encoding + "\u201D."); From 24538913bedd0259cfa19075160dadc822bb6d1c Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sat, 12 Sep 2020 20:51:51 +0900 Subject: [PATCH 05/15] Drop superseded encoding-checking code This change drops some code which performs various encoding checks that no longer correspond to any current requirements in the Encoding spec. 
--- src/nu/validator/htmlparser/io/Driver.java | 27 ------- src/nu/validator/htmlparser/io/Encoding.java | 72 ------------------- .../validator/htmlparser/io/MetaSniffer.java | 22 ------ 3 files changed, 121 deletions(-) diff --git a/src/nu/validator/htmlparser/io/Driver.java b/src/nu/validator/htmlparser/io/Driver.java index 8bab457f..af546075 100644 --- a/src/nu/validator/htmlparser/io/Driver.java +++ b/src/nu/validator/htmlparser/io/Driver.java @@ -470,33 +470,6 @@ protected Encoding encodingFromExternalDeclaration(String encoding) protected Encoding whineAboutEncodingAndReturnActual(String encoding, Encoding cs) throws SAXException { String canonName = cs.getCanonName(); - if (!cs.isRegistered()) { - if (encoding.startsWith("x-")) { - tokenizer.err("The encoding \u201C" - + encoding - + "\u201D is not an IANA-registered encoding. (Charmod C022)"); - } else { - tokenizer.err("The encoding \u201C" - + encoding - + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)"); - } - } else if (!canonName.equals(encoding)) { - tokenizer.err("The encoding \u201C" - + encoding - + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C" - + canonName + "\u201D. (Charmod C024)"); - } - if (cs.isShouldNot()) { - tokenizer.warn("Authors should not use the character encoding \u201C" - + encoding - + "\u201D. It is recommended to use \u201CUTF-8\u201D."); - } else if (cs.isLikelyEbcdic()) { - tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D."); - } else if (cs.isObscure()) { - tokenizer.warn("The character encoding \u201C" - + encoding - + "\u201D is not widely supported. 
Better interoperability may be achieved by using \u201CUTF-8\u201D."); - } if (!canonName.equals(encoding)) { tokenizer.err(Encoding.msgNotPreferredName(encoding, canonName)); } diff --git a/src/nu/validator/htmlparser/io/Encoding.java b/src/nu/validator/htmlparser/io/Encoding.java index e7109893..20a9e239 100644 --- a/src/nu/validator/htmlparser/io/Encoding.java +++ b/src/nu/validator/htmlparser/io/Encoding.java @@ -52,17 +52,6 @@ public class Encoding { public static final Encoding WINDOWS1252; - private static String[] SHOULD_NOT = { "jisx02121990", "xjis0208" }; - - private static String[] BANNED = { "bocu1", "cesu8", "compoundtext", - "iscii91", "macarabic", "maccentraleurroman", "maccroatian", - "maccyrillic", "macdevanagari", "macfarsi", "macgreek", - "macgujarati", "macgurmukhi", "machebrew", "macicelandic", - "macroman", "macromanian", "macthai", "macturkish", "macukranian", - "scsu", "utf32", "utf32be", "utf32le", "utf7", "ximapmailboxname", - "xjisautodetect", "xutf16bebom", "xutf16lebom", "xutf32bebom", - "xutf32lebom", "xutf16oppositeendian", "xutf16platformendian", - "xutf32oppositeendian", "xutf32platformendian" }; private static Map encodingByLabel = new HashMap(); @@ -304,12 +293,6 @@ public class Encoding { private final Charset charset; - private final boolean obscure; - - private final boolean shouldNot; - - private final boolean likelyEbcdic; - static { Set encodings = new HashSet(); @@ -345,30 +328,6 @@ asciiSuperset, isObscure(name), WINDOWS1252 = forName("windows-1252"); } - private static boolean isObscure(String lowerCasePreferredIanaName) { - return !(Arrays.binarySearch(NOT_OBSCURE, lowerCasePreferredIanaName) > -1); - } - - private static boolean isBanned(String lowerCasePreferredIanaName) { - if (lowerCasePreferredIanaName.startsWith("xibm")) { - return true; - } - return (Arrays.binarySearch(BANNED, lowerCasePreferredIanaName) > -1); - } - - private static boolean isShouldNot(String lowerCasePreferredIanaName) { - return 
(Arrays.binarySearch(SHOULD_NOT, lowerCasePreferredIanaName) > -1); - } - - private static boolean isLikelyEbcdic(String canonName, - boolean asciiSuperset) { - if (!asciiSuperset) { - return (canonName.startsWith("cp") || canonName.startsWith("ibm") || canonName.startsWith("xibm")); - } else { - return false; - } - } - public static Encoding forName(String name) { Encoding rv = encodingByLabel.get(toNameKey(name)); if (rv == null) { @@ -454,37 +413,6 @@ public String getCanonName() { return canonName; } - /** - * Returns the likelyEbcdic. - * - * @return the likelyEbcdic - */ - public boolean isLikelyEbcdic() { - return likelyEbcdic; - } - - /** - * Returns the obscure. - * - * @return the obscure - */ - public boolean isObscure() { - return obscure; - } - - /** - * Returns the shouldNot. - * - * @return the shouldNot - */ - public boolean isShouldNot() { - return shouldNot; - } - - public boolean isRegistered() { - return !canonName.startsWith("x-"); - } - /** * @return * @see java.nio.charset.Charset#canEncode() diff --git a/src/nu/validator/htmlparser/io/MetaSniffer.java b/src/nu/validator/htmlparser/io/MetaSniffer.java index e8e7018e..600c1e72 100755 --- a/src/nu/validator/htmlparser/io/MetaSniffer.java +++ b/src/nu/validator/htmlparser/io/MetaSniffer.java @@ -169,28 +169,6 @@ protected boolean tryCharset(String encoding) throws SAXException { } else { Encoding cs = Encoding.forName(encoding); String canonName = cs.getCanonName(); - if (!cs.isRegistered()) { - if (encoding.startsWith("x-")) { - err("The encoding \u201C" - + encoding - + "\u201D is not an IANA-registered encoding. (Charmod C022)"); - } else { - err("The encoding \u201C" - + encoding - + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)"); - } - } else if (!cs.getCanonName().equals(encoding)) { - err("The encoding \u201C" + encoding - + "\u201D is not the preferred name of the character encoding in use. 
The preferred name is \u201C" - + canonName + "\u201D. (Charmod C024)"); - } - if (cs.isShouldNot()) { - warn("Authors should not use the character encoding \u201C" - + encoding - + "\u201D. It is recommended to use \u201CUTF-8\u201D."); - } else if (cs.isObscure()) { - warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); - } if (!cs.getCanonName().equals(encoding)) { err(Encoding.msgNotCanonicalName(encoding, canonName)); this.characterEncoding = cs; From 1b06e4fa9f7434d681cf6b4a62ba443d2d11cd3a Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sat, 12 Sep 2020 21:15:27 +0900 Subject: [PATCH 06/15] Drop obsoleted params from Encoding constructor --- src/nu/validator/htmlparser/io/Encoding.java | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/src/nu/validator/htmlparser/io/Encoding.java b/src/nu/validator/htmlparser/io/Encoding.java index 20a9e239..c85f8365 100644 --- a/src/nu/validator/htmlparser/io/Encoding.java +++ b/src/nu/validator/htmlparser/io/Encoding.java @@ -388,20 +388,10 @@ public static String toAsciiLowerCase(String str) { /** * @param canonName * @param charset - * @param asciiSuperset - * @param obscure - * @param shouldNot - * @param likelyEbcdic */ - private Encoding(final String canonName, final Charset charset, - final boolean asciiSuperset, final boolean obscure, - final boolean shouldNot, final boolean likelyEbcdic) { + private Encoding(final String canonName, final Charset charset) { this.canonName = canonName; this.charset = charset; - this.asciiSuperset = asciiSuperset; - this.obscure = obscure; - this.shouldNot = shouldNot; - this.likelyEbcdic = likelyEbcdic; } /** @@ -466,11 +456,7 @@ public static void main(String[] args) { for (Map.Entry entry : encodingByLabel.entrySet()) { String name = entry.getKey(); Encoding enc = entry.getValue(); - System.out.printf( - "%21s: canon %21s, obs %5s, reg 
%5s, asc %5s, ebc %5s\n", - name, enc.getCanonName(), enc.isObscure(), - enc.isRegistered(), enc.isAsciiSuperset(), - enc.isLikelyEbcdic()); + System.out.printf("%21s: canon %13s\n", name, enc.getCanonName()); } } From 9836218f9e7bedca44b39063646ae662d8317de8 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sat, 12 Sep 2020 20:25:30 +0900 Subject: [PATCH 07/15] Conform supported encodings to Encoding spec This change conforms the set of supported encodings for the parser to the requirements in the Encoding spec. Specifically, this restricts the set of encoding names and labels to those listed in the table at https://encoding.spec.whatwg.org/#names-and-labels, and the statement: > The table below lists all encodings and their labels user agents must > support. User agents must not support any other encodings or labels. --- src/nu/validator/htmlparser/io/Encoding.java | 589 +++++++++++-------- 1 file changed, 329 insertions(+), 260 deletions(-) diff --git a/src/nu/validator/htmlparser/io/Encoding.java b/src/nu/validator/htmlparser/io/Encoding.java index c85f8365..83695dc5 100644 --- a/src/nu/validator/htmlparser/io/Encoding.java +++ b/src/nu/validator/htmlparser/io/Encoding.java @@ -55,272 +55,341 @@ public class Encoding { private static Map encodingByLabel = new HashMap(); - /* From the table at https://encoding.spec.whatwg.org/#names-and-labels, - * everything in the Labels column, sorted */ - private static String[] NOT_OBSCURE = { // - "866", // - "ansi_x3.4-1968", // - "arabic", // - "ascii", // - "asmo-708", // - "big5", // - "big5-hkscs", // - "chinese", // - "cn-big5", // - "cp1250", // - "cp1251", // - "cp1252", // - "cp1253", // - "cp1254", // - "cp1255", // - "cp1256", // - "cp1257", // - "cp1258", // - "cp819", // - "cp866", // - "csbig5", // - "cseuckr", // - "cseucpkdfmtjapanese", // - "csgb2312", // - "csibm866", // - "csiso2022jp", // - "csiso2022kr", // - "csiso58gb231280", // - "csiso88596e", // - "csiso88596i", // - "csiso88598e", // - 
"csiso88598i", // - "csisolatin1", // - "csisolatin2", // - "csisolatin3", // - "csisolatin4", // - "csisolatin5", // - "csisolatin6", // - "csisolatin9", // - "csisolatinarabic", // - "csisolatincyrillic", // - "csisolatingreek", // - "csisolatinhebrew", // - "cskoi8r", // - "csksc56011987", // - "csmacintosh", // - "csshiftjis", // - "csunicode", // - "cyrillic", // - "dos-874", // - "ecma-114", // - "ecma-118", // - "elot_928", // - "euc-jp", // - "euc-kr", // - "gb18030", // - "gb2312", // - "gb_2312", // - "gb_2312-80", // - "gbk", // - "greek", // - "greek8", // - "hebrew", // - "hz-gb-2312", // - "ibm819", // - "ibm866", // - "iso-10646-ucs-2", // - "iso-2022-cn", // - "iso-2022-cn-ext", // - "iso-2022-jp", // - "iso-2022-kr", // - "iso-8859-1", // - "iso-8859-10", // - "iso-8859-11", // - "iso-8859-13", // - "iso-8859-14", // - "iso-8859-15", // - "iso-8859-16", // - "iso-8859-2", // - "iso-8859-3", // - "iso-8859-4", // - "iso-8859-5", // - "iso-8859-6", // - "iso-8859-6-e", // - "iso-8859-6-i", // - "iso-8859-7", // - "iso-8859-8", // - "iso-8859-8-e", // - "iso-8859-8-i", // - "iso-8859-9", // - "iso-ir-100", // - "iso-ir-101", // - "iso-ir-109", // - "iso-ir-110", // - "iso-ir-126", // - "iso-ir-127", // - "iso-ir-138", // - "iso-ir-144", // - "iso-ir-148", // - "iso-ir-149", // - "iso-ir-157", // - "iso-ir-58", // - "iso8859-1", // - "iso8859-10", // - "iso8859-11", // - "iso8859-13", // - "iso8859-14", // - "iso8859-15", // - "iso8859-2", // - "iso8859-3", // - "iso8859-4", // - "iso8859-5", // - "iso8859-6", // - "iso8859-7", // - "iso8859-8", // - "iso8859-9", // - "iso88591", // - "iso885910", // - "iso885911", // - "iso885913", // - "iso885914", // - "iso885915", // - "iso88592", // - "iso88593", // - "iso88594", // - "iso88595", // - "iso88596", // - "iso88597", // - "iso88598", // - "iso88599", // - "iso_8859-1", // - "iso_8859-15", // - "iso_8859-1:1987", // - "iso_8859-2", // - "iso_8859-2:1987", // - "iso_8859-3", // - "iso_8859-3:1988", // - 
"iso_8859-4", // - "iso_8859-4:1988", // - "iso_8859-5", // - "iso_8859-5:1988", // - "iso_8859-6", // - "iso_8859-6:1987", // - "iso_8859-7", // - "iso_8859-7:1987", // - "iso_8859-8", // - "iso_8859-8:1988", // - "iso_8859-9", // - "iso_8859-9:1989", // - "koi", // - "koi8", // - "koi8-r", // - "koi8-ru", // - "koi8-u", // - "koi8_r", // - "korean", // - "ks_c_5601-1987", // - "ks_c_5601-1989", // - "ksc5601", // - "ksc_5601", // - "l1", // - "l2", // - "l3", // - "l4", // - "l5", // - "l6", // - "l9", // - "latin1", // - "latin2", // - "latin3", // - "latin4", // - "latin5", // - "latin6", // - "logical", // - "mac", // - "macintosh", // - "ms932", // - "ms_kanji", // - "replacement", // - "shift-jis", // - "shift_jis", // - "sjis", // - "sun_eu_greek", // - "tis-620", // - "ucs-2", // - "unicode", // - "unicode-1-1-utf-8", // - "unicode11utf8", // - "unicode20utf8", // - "unicodefeff", // - "unicodefffe", // - "us-ascii", // - "utf-16", // - "utf-16be", // - "utf-16le", // - "utf-8", // - "utf8", // - "visual", // - "windows-1250", // - "windows-1251", // - "windows-1252", // - "windows-1253", // - "windows-1254", // - "windows-1255", // - "windows-1256", // - "windows-1257", // - "windows-1258", // - "windows-31j", // - "windows-874", // - "windows-949", // - "x-cp1250", // - "x-cp1251", // - "x-cp1252", // - "x-cp1253", // - "x-cp1254", // - "x-cp1255", // - "x-cp1256", // - "x-cp1257", // - "x-cp1258", // - "x-euc-jp", // - "x-gbk", // - "x-mac-cyrillic", // - "x-mac-roman", // - "x-mac-ukrainian", // - "x-sjis", // - "x-unicode20utf8", // - "x-user-defined", // - "x-x-big5", // - }; - private static Map encodingByCookedName = new HashMap(); + private static void createEncoding(String name, String[] labels) { + if (!Charset.isSupported(name)) { + return; + } + Charset cs = Charset.forName(name); + Encoding enc = new Encoding(name.toLowerCase().intern(), cs); + for (String label : labels) { + encodingByLabel.put(label, enc); + } + } + + static { + /* See 
https://encoding.spec.whatwg.org/#names-and-labels */ + createEncoding( // + "UTF-8", new String[] { // + "unicode-1-1-utf-8", // + "unicode11utf8", // + "unicode20utf8", // + "utf-8", // + "utf8", // + "x-unicode20utf8" }); + createEncoding( // + "IBM866", new String[] { // + "866", // + "cp866", // + "csibm866", // + "ibm866" }); + createEncoding( // + "ISO-8859-2", new String[] { // + "csisolatin2", // + "iso-8859-2", // + "iso-ir-101", // + "iso8859-2", // + "iso88592", // + "iso_8859-2", // + "iso_8859-2:1987", // + "l2", // + "latin2" }); + createEncoding( // + "ISO-8859-3", new String[] { // + "csisolatin3", // + "iso-8859-3", // + "iso-ir-109", // + "iso8859-3", // + "iso88593", // + "iso_8859-3", // + "iso_8859-3:1988", // + "l3", // + "latin3" }); + createEncoding( // + "ISO-8859-4", new String[] { // + "csisolatin4", // + "iso-8859-4", // + "iso-ir-110", // + "iso8859-4", // + "iso88594", // + "iso_8859-4", // + "iso_8859-4:1988", // + "l4", // + "latin4" }); + createEncoding( // + "ISO-8859-5", new String[] { // + "csisolatincyrillic", // + "cyrillic", // + "iso-8859-5", // + "iso-ir-144", // + "iso8859-5", // + "iso88595", // + "iso_8859-5", // + "iso_8859-5:1988" }); + createEncoding( // + "ISO-8859-6", new String[] { // + "arabic", // + "asmo-708", // + "csiso88596e", // + "csiso88596i", // + "csisolatinarabic", // + "ecma-114", // + "iso-8859-6", // + "iso-8859-6-e", // + "iso-8859-6-i", // + "iso-ir-127", // + "iso8859-6", // + "iso88596", // + "iso_8859-6", // + "iso_8859-6:1987" }); + createEncoding( // + "ISO-8859-7", new String[] { // + "csisolatingreek", // + "ecma-118", // + "elot_928", // + "greek", // + "greek8", // + "iso-8859-7", // + "iso-ir-126", // + "iso8859-7", // + "iso88597", // + "iso_8859-7", // + "iso_8859-7:1987", // + "sun_eu_greek" }); + createEncoding( // + "ISO-8859-8", new String[] { // + "csiso88598e", // + "csisolatinhebrew", // + "hebrew", // + "iso-8859-8", // + "iso-8859-8-e", // + "iso-ir-138", // + "iso8859-8", // + 
"iso88598", // + "iso_8859-8", // + "iso_8859-8:1988", // + "visual" }); + createEncoding( // + // Unsupported in Java + "ISO-8859-8-I", new String[] { // + "csiso88598i", // + "iso-8859-8-i", // + "logical" }); + createEncoding( // + // Unsupported in Java + "ISO-8859-10", new String[] { // + "csisolatin6", // + "iso-8859-10", // + "iso-ir-157", // + "iso8859-10", // + "iso885910", // + "l6", // + "latin6" }); + createEncoding( // + "ISO-8859-13", new String[] { // + "iso-8859-13", // + "iso8859-13", // + "iso885913" }); + createEncoding( // + // Unsupported in Java + "ISO-8859-14", new String[] { // + "iso-8859-14", // + "iso8859-14", // + "iso885914" }); + createEncoding( // + "ISO-8859-15", new String[] { // + "csisolatin9", // + "iso-8859-15", // + "iso8859-15", // + "iso885915", // + "iso_8859-15", // + "l9" }); + createEncoding( // + "ISO-8859-16", new String[] { // + "iso-8859-16" }); + createEncoding( // + "KOI8-R", new String[] { // + "cskoi8r", // + "koi", // + "koi8", // + "koi8-r", // + "koi8_r" }); + createEncoding( // + "KOI8-U", new String[] { // + "koi8-ru", // + "koi8-u" }); + createEncoding( // + // Unsupported in Java + "macintosh", new String[] { // + "csmacintosh", // + "mac", // + "macintosh", // + "x-mac-roman" }); + createEncoding( // + "windows-874", new String[] { // + "dos-874", // + "iso-8859-11", // + "iso8859-11", // + "iso885911", // + "tis-620", // + "windows-874" }); + createEncoding( // + "windows-1250", new String[] { // + "cp1250", // + "windows-1250", // + "x-cp1250" }); + createEncoding( // + "windows-1251", new String[] { // + "cp1251", // + "windows-1251", // + "x-cp1251" }); + createEncoding( // + "windows-1252", new String[] { // + "ansi_x3.4-1968", // + "ascii", // + "cp1252", // + "cp819", // + "csisolatin1", // + "ibm819", // + "iso-8859-1", // + "iso-ir-100", // + "iso8859-1", // + "iso88591", // + "iso_8859-1", // + "iso_8859-1:1987", // + "l1", // + "latin1", // + "us-ascii", // + "windows-1252", // + "x-cp1252" }); 
+ createEncoding( // + "windows-1253", new String[] { // + "cp1253", // + "windows-1253", // + "x-cp1253" }); + createEncoding( // + "windows-1254", new String[] { // + "cp1254", // + "csisolatin5", // + "iso-8859-9", // + "iso-ir-148", // + "iso8859-9", // + "iso88599", // + "iso_8859-9", // + "iso_8859-9:1989", // + "l5", // + "latin5", // + "windows-1254", // + "x-cp1254" }); + createEncoding( // + "windows-1255", new String[] { // + "cp1255", // + "windows-1255", // + "x-cp1255" }); + createEncoding( // + "windows-1256", new String[] { // + "cp1256", // + "windows-1256", // + "x-cp1256" }); + createEncoding( // + "windows-1257", new String[] { // + "cp1257", // + "windows-1257", // + "x-cp1257" }); + createEncoding( // + "windows-1258", new String[] { // + "cp1258", // + "windows-1258", // + "x-cp1258" }); + createEncoding( // + // Unsupported in Java + "x-mac-cyrillic", new String[] { // + "x-mac-cyrillic", // + "x-mac-ukrainian" }); + createEncoding( // + "GBK", new String[] { // + "chinese", // + "csgb2312", // + "csiso58gb231280", // + "gb2312", // + "gb_2312", // + "gb_2312-80", // + "gbk", // + "iso-ir-58", // + "x-gbk" }); + createEncoding( // + "gb18030", new String[] { // + "gb18030" }); + createEncoding( // + "Big5", new String[] { // + "big5", // + "big5-hkscs", // + "cn-big5", // + "csbig5", // + "x-x-big5" }); + createEncoding( // + "EUC-JP", new String[] { // + "cseucpkdfmtjapanese", // + "euc-jp", // + "x-euc-jp" }); + createEncoding( // + "ISO-2022-JP", new String[] { // + "csiso2022jp", // + "iso-2022-jp" }); + createEncoding( // + "Shift_JIS", new String[] { // + "csshiftjis", // + "ms932", // + "ms_kanji", // + "shift-jis", // + "shift_jis", // + "sjis", // + "windows-31j", // + "x-sjis" }); + createEncoding( // + "EUC-KR", new String[] { // + "cseuckr", // + "csksc56011987", // + "euc-kr", // + "iso-ir-149", // + "korean", // + "ks_c_5601-1987", // + "ks_c_5601-1989", // + "ksc5601", // + "ksc_5601", // + "windows-949" }); + createEncoding( 
// + // Special case + "replacement", new String[] { // + "csiso2022kr", // + "hz-gb-2312", // + "iso-2022-cn", // + "iso-2022-cn-ext", // + "iso-2022-kr", // + "replacement" }); + createEncoding( // + "UTF-16BE", new String[] { // + "unicodefffe", // + "utf-16be" }); + createEncoding( // + "UTF-16LE", new String[] { // + "csunicode", // + "iso-10646-ucs-2", // + "ucs-2", // + "unicode", // + "unicodefeff", // + "utf-16", // + "utf-16le" }); + createEncoding( // + // Special case + "x-user-defined", new String[] { // + "x-user-defined" }); + } private final String canonName; private final Charset charset; static { - Set encodings = new HashSet(); - - SortedMap charsets = Charset.availableCharsets(); - for (Map.Entry entry : charsets.entrySet()) { - Charset cs = entry.getValue(); - String name = toNameKey(cs.name()); - String canonName = toAsciiLowerCase(cs.name()); - if (!isBanned(stripDashAndUnderscore(name))) { - name = name.intern(); - boolean asciiSuperset = asciiMapsToBasicLatin(testBuf, cs); - Encoding enc = new Encoding(canonName.intern(), cs, - asciiSuperset, isObscure(name), - isShouldNot(stripDashAndUnderscore(name)), - isLikelyEbcdic(name, asciiSuperset)); - encodings.add(enc); - Set aliases = cs.aliases(); - for (String alias : aliases) { - encodingByLabel.put(toNameKey(alias).intern(), enc); - } - } - } - // Overwrite possible overlapping aliases with the real things--just in - // case - for (Encoding encoding : encodings) { - encodingByLabel.put(toNameKey(encoding.getCanonName()), - encoding); - } UTF8 = forName("utf-8"); UTF16 = forName("utf-16"); UTF16BE = forName("utf-16be"); From 514edc91eb0fd0761b82d640e5d73cfcd8d2a4fb Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sat, 12 Sep 2020 22:15:13 +0900 Subject: [PATCH 08/15] Drop Encoding.toAsciiLowerCase; use toLowerCase() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change drops the Encoding.toAsciiLowerCase() method and replaces calls to it 
with calls to string.toLowerCase() — because none of the conforming encoding names have any special characters for which the behavior of Encoding.toAsciiLowerCase() would produce any different results than string.toLowerCase() does. --- src/nu/validator/htmlparser/io/Driver.java | 4 +-- src/nu/validator/htmlparser/io/Encoding.java | 29 ------------------- .../validator/htmlparser/io/MetaSniffer.java | 2 +- 3 files changed, 3 insertions(+), 32 deletions(-) diff --git a/src/nu/validator/htmlparser/io/Driver.java b/src/nu/validator/htmlparser/io/Driver.java index af546075..89bf816d 100644 --- a/src/nu/validator/htmlparser/io/Driver.java +++ b/src/nu/validator/htmlparser/io/Driver.java @@ -353,7 +353,7 @@ public void setEncoding(Encoding encoding, Confidence confidence) { public boolean internalEncodingDeclaration(String internalCharset) throws SAXException { try { - internalCharset = Encoding.toAsciiLowerCase(internalCharset); + internalCharset = internalCharset.toLowerCase(); Encoding cs; if ("utf-16".equals(internalCharset) || "utf-16be".equals(internalCharset) @@ -445,7 +445,7 @@ protected Encoding encodingFromExternalDeclaration(String encoding) if (encoding == null) { return null; } - encoding = Encoding.toAsciiLowerCase(encoding); + encoding = encoding.toLowerCase(); try { Encoding cs = Encoding.forName(encoding); if ("utf-16".equals(cs.getCanonName()) diff --git a/src/nu/validator/htmlparser/io/Encoding.java b/src/nu/validator/htmlparser/io/Encoding.java index 83695dc5..99a79075 100644 --- a/src/nu/validator/htmlparser/io/Encoding.java +++ b/src/nu/validator/htmlparser/io/Encoding.java @@ -425,35 +425,6 @@ public static String toNameKey(String str) { return new String(buf, 0, j); } - public static String stripDashAndUnderscore(String str) { - if (str == null) { - return null; - } - char[] buf = new char[str.length()]; - for (int i = 0; i < str.length(); i++) { - char c = str.charAt(i); - if (c == '-' || c == '_') { - buf[i] = c; - } - } - return new 
String(buf); - } - - public static String toAsciiLowerCase(String str) { - if (str == null) { - return null; - } - char[] buf = new char[str.length()]; - for (int i = 0; i < str.length(); i++) { - char c = str.charAt(i); - if (c >= 'A' && c <= 'Z') { - c += 0x20; - } - buf[i] = c; - } - return new String(buf); - } - /** * @param canonName * @param charset diff --git a/src/nu/validator/htmlparser/io/MetaSniffer.java b/src/nu/validator/htmlparser/io/MetaSniffer.java index 600c1e72..032a0ae9 100755 --- a/src/nu/validator/htmlparser/io/MetaSniffer.java +++ b/src/nu/validator/htmlparser/io/MetaSniffer.java @@ -159,7 +159,7 @@ public String getEncoding() { } protected boolean tryCharset(String encoding) throws SAXException { - encoding = Encoding.toAsciiLowerCase(encoding); + encoding = encoding.toLowerCase(); try { // XXX spec says only UTF-16 if ("utf-16".equals(encoding) || "utf-16be".equals(encoding) || "utf-16le".equals(encoding) || "utf-32".equals(encoding) || "utf-32be".equals(encoding) || "utf-32le".equals(encoding)) { From 7fa4d801433b65c45e22458e567e16cdb583e238 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sat, 12 Sep 2020 23:59:12 +0900 Subject: [PATCH 09/15] Rename to whineAboutEncodingAndReturnCanonical() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change renames the whineAboutEncodingAndReturnActual() method to whineAboutEncodingAndReturnCanonical() — in order to make more clear it’s reporting that the canonical name of the encoding is preferred over any particular label for that encoding. 
--- src/nu/validator/htmlparser/io/Driver.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nu/validator/htmlparser/io/Driver.java b/src/nu/validator/htmlparser/io/Driver.java index 89bf816d..c007451a 100644 --- a/src/nu/validator/htmlparser/io/Driver.java +++ b/src/nu/validator/htmlparser/io/Driver.java @@ -384,7 +384,7 @@ public boolean internalEncodingDeclaration(String internalCharset) + "\u201D disagrees with the actual encoding of the document (\u201C" + characterEncoding.getCanonName() + "\u201D)."); } else { - Encoding newEnc = whineAboutEncodingAndReturnActual( + Encoding newEnc = whineAboutEncodingAndReturnCanonical( internalCharset, cs); tokenizer.errTreeBuilder("Changing character encoding \u201C" + internalCharset + "\u201D and reparsing."); @@ -452,7 +452,7 @@ protected Encoding encodingFromExternalDeclaration(String encoding) || "utf-32".equals(cs.getCanonName())) { swallowBom = false; } - return whineAboutEncodingAndReturnActual(encoding, cs); + return whineAboutEncodingAndReturnCanonical(encoding, cs); } catch (UnsupportedCharsetException e) { tokenizer.err("Unsupported character encoding name: \u201C" + encoding + "\u201D. 
Will sniff."); @@ -467,7 +467,7 @@ protected Encoding encodingFromExternalDeclaration(String encoding) * @return * @throws SAXException */ - protected Encoding whineAboutEncodingAndReturnActual(String encoding, + protected Encoding whineAboutEncodingAndReturnCanonical(String encoding, Encoding cs) throws SAXException { String canonName = cs.getCanonName(); if (!canonName.equals(encoding)) { From edfaeff1d5e92d3dfb95e2ecffc11f192b9fae29 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sun, 13 Sep 2020 02:38:45 +0900 Subject: [PATCH 10/15] =?UTF-8?q?Drop=20UTF-32=20&=20=E2=80=9CUTF-16?= =?UTF-8?q?=E2=80=9D;=20use=20UTF-16BE=20and=20UTF-16LE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change drops all handling for UTF-32 (which is a completely invalid/unsupported encoding per the Encoding spec) and replaces the handling for “UTF-16” (which also isn’t a valid/supported encoding) with handling for the valid/supported encodings UTF-16BE and UTF-16LE.
--- src/nu/validator/htmlparser/io/Driver.java | 9 ++++----- src/nu/validator/htmlparser/io/Encoding.java | 3 --- src/nu/validator/htmlparser/io/MetaSniffer.java | 3 +-- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/nu/validator/htmlparser/io/Driver.java b/src/nu/validator/htmlparser/io/Driver.java index c007451a..27d970d2 100644 --- a/src/nu/validator/htmlparser/io/Driver.java +++ b/src/nu/validator/htmlparser/io/Driver.java @@ -354,9 +354,8 @@ public boolean internalEncodingDeclaration(String internalCharset) throws SAXException { try { internalCharset = internalCharset.toLowerCase(); - Encoding cs; - if ("utf-16".equals(internalCharset) - || "utf-16be".equals(internalCharset) + Encoding cs = Encoding.forName(internalCharset); + if ("utf-16be".equals(internalCharset) || "utf-16le".equals(internalCharset)) { tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C" + internalCharset @@ -448,8 +447,8 @@ protected Encoding encodingFromExternalDeclaration(String encoding) encoding = encoding.toLowerCase(); try { Encoding cs = Encoding.forName(encoding); - if ("utf-16".equals(cs.getCanonName()) - || "utf-32".equals(cs.getCanonName())) { + if ("utf-16be".equals(cs.getCanonName()) + || "utf-16le".equals(cs.getCanonName())) { swallowBom = false; } return whineAboutEncodingAndReturnCanonical(encoding, cs); diff --git a/src/nu/validator/htmlparser/io/Encoding.java b/src/nu/validator/htmlparser/io/Encoding.java index 99a79075..ded6afb4 100644 --- a/src/nu/validator/htmlparser/io/Encoding.java +++ b/src/nu/validator/htmlparser/io/Encoding.java @@ -44,8 +44,6 @@ public class Encoding { public static final Encoding UTF8; - public static final Encoding UTF16; - public static final Encoding UTF16LE; public static final Encoding UTF16BE; @@ -391,7 +389,6 @@ private static void createEncoding(String name, String[] labels) { static { UTF8 = forName("utf-8"); - UTF16 = forName("utf-16"); UTF16BE = forName("utf-16be"); UTF16LE = 
forName("utf-16le"); WINDOWS1252 = forName("windows-1252"); diff --git a/src/nu/validator/htmlparser/io/MetaSniffer.java b/src/nu/validator/htmlparser/io/MetaSniffer.java index 032a0ae9..cc4bae27 100755 --- a/src/nu/validator/htmlparser/io/MetaSniffer.java +++ b/src/nu/validator/htmlparser/io/MetaSniffer.java @@ -161,8 +161,7 @@ public String getEncoding() { protected boolean tryCharset(String encoding) throws SAXException { encoding = encoding.toLowerCase(); try { - // XXX spec says only UTF-16 - if ("utf-16".equals(encoding) || "utf-16be".equals(encoding) || "utf-16le".equals(encoding) || "utf-32".equals(encoding) || "utf-32be".equals(encoding) || "utf-32le".equals(encoding)) { + if ("utf-16be".equals(encoding) || "utf-16le".equals(encoding)) { this.characterEncoding = Encoding.UTF8; err("The internal character encoding declaration specified \u201C" + encoding + "\u201D which is not a rough superset of ASCII. Using \u201CUTF-8\u201D instead."); return true; From 0cfe6d230ad636a7022ebc51787549f2ac96fb8c Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sun, 13 Sep 2020 03:56:21 +0900 Subject: [PATCH 11/15] =?UTF-8?q?Sync=20w/=20=E2=80=9CChanging=20the=20enc?= =?UTF-8?q?oding=20while=20parsing=E2=80=9D=20algo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change brings the parser into conformance with current requirements in the “Changing the encoding while parsing” algorithm in the HTML spec. --- src/nu/validator/htmlparser/io/Driver.java | 112 +++++++++++++++------ 1 file changed, 80 insertions(+), 32 deletions(-) diff --git a/src/nu/validator/htmlparser/io/Driver.java b/src/nu/validator/htmlparser/io/Driver.java index 27d970d2..b5df79b6 100644 --- a/src/nu/validator/htmlparser/io/Driver.java +++ b/src/nu/validator/htmlparser/io/Driver.java @@ -21,6 +21,14 @@ * DEALINGS IN THE SOFTWARE. 
*/ +/* + * The comments following this one that use the same comment syntax as this + * comment are quotes from the HTML Standard at https://html.spec.whatwg.org/ + * as of 10 September 2020. That document came with this statement: + * Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). This work is + * licensed under a Creative Commons Attribution 4.0 International License. + */ + package nu.validator.htmlparser.io; import java.io.IOException; @@ -214,9 +222,8 @@ public void tokenize(InputSource is, int bufferSize) tokenizer.getErrorHandler(), tokenizer, this, heuristics); } else { if (this.characterEncoding != Encoding.UTF8) { - errorWithoutLocation("Legacy encoding \u201C" - + this.characterEncoding.getCanonName() - + "\u201D used. Documents must use UTF-8."); + errorWithoutLocation(Encoding.msgLegacyEncoding( + this.characterEncoding.getCanonName())); } becomeConfident(); this.reader = new HtmlInputStreamReader(inputStream, @@ -350,50 +357,92 @@ public void setEncoding(Encoding encoding, Confidence confidence) { } } + private void errInternalActualDiffer(String internalCharset, String actual) + throws SAXException { + if (!internalCharset.equals(actual)) { + tokenizer.errTreeBuilder( + "Ignoring internal encoding declaration \u201C" + + internalCharset + "\u201D, which disagrees with" + + " the actual encoding of the document (\u201C" + + actual + "\u201D)."); + } + } + public boolean internalEncodingDeclaration(String internalCharset) throws SAXException { + String actual = characterEncoding.getCanonName(); + if (confidence == Confidence.CERTAIN) { + errInternalActualDiffer(internalCharset, actual); + return true; + } + /* https://html.spec.whatwg.org/#changing-the-encoding-while-parsing */ try { + if ("utf-16be".equals(actual) || "utf-16le".equals(actual)) { + errInternalActualDiffer(internalCharset, actual); + /* + * 1. 
If the encoding that is already being used to interpret + * the input stream is a UTF-16 encoding, then set the + * confidence to certain and return. The new encoding is ignored + * becomeConfident(); + */ + return true; + } internalCharset = internalCharset.toLowerCase(); Encoding cs = Encoding.forName(internalCharset); if ("utf-16be".equals(internalCharset) || "utf-16le".equals(internalCharset)) { - tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C" - + internalCharset - + "\u201D which is not an ASCII superset. Continuing as if the encoding had been \u201Cutf-8\u201D."); + /* + * 2. If the new encoding is a UTF-16 encoding, then change it + * to UTF-8. + */ + tokenizer.errTreeBuilder( + Encoding.msgIgnoredCharset(internalCharset, "utf-8")); cs = Encoding.UTF8; internalCharset = "utf-8"; - } else { - cs = Encoding.forName(internalCharset); - } - Encoding actual = cs.getActualHtmlEncoding(); - if (actual == null) { - actual = cs; + } else if ("x-user-defined".equals(internalCharset)) { + /* + * 3. If the new encoding is x-user-defined, then change it to + * windows-1252. + */ + tokenizer.errTreeBuilder(Encoding.msgIgnoredCharset( + "x-user-defined", "windows-1252")); + cs = Encoding.WINDOWS1252; + internalCharset = "windows-1252"; } if (characterEncoding == null) { // Reader case return true; } - if (characterEncoding == actual) { + if (characterEncoding == cs) { + /* + * 4. If the new encoding is identical or equivalent to the + * encoding that is already being used to interpret the input + * stream, then set the confidence to certain and return. 
+ */ becomeConfident(); return true; } - if (confidence == Confidence.CERTAIN && actual != characterEncoding) { - tokenizer.errTreeBuilder("Internal encoding declaration \u201C" - + internalCharset - + "\u201D disagrees with the actual encoding of the document (\u201C" - + characterEncoding.getCanonName() + "\u201D)."); - } else { - Encoding newEnc = whineAboutEncodingAndReturnCanonical( - internalCharset, cs); - tokenizer.errTreeBuilder("Changing character encoding \u201C" - + internalCharset + "\u201D and reparsing."); - characterEncoding = newEnc; - throw new ReparseException(); - } - return true; + /* + * 6. Otherwise, navigate to the document again, with + * historyHandling set to "replace", and using the same source + * browsing context, but this time skip the encoding sniffing + * algorithm and instead just set the encoding to the new encoding + */ + Encoding newEnc = whineAboutEncodingAndReturnCanonical( + internalCharset, cs); + tokenizer.errTreeBuilder("Changing character encoding to \u201C" + + internalCharset + "\u201D and reparsing."); + characterEncoding = newEnc; + // Note: We intentionally don’t call becomeConfident() at this + // point. If we did, it would end up causing the exception + // java.lang.IllegalStateException: rewind() after willNotRewind() + // to be thrown later. So we are departing here from strictly + // following the ordering in the corresponding spec language, which + // specifies setting the confidence to "certain" at this point. 
+ throw new ReparseException(); } catch (UnsupportedCharsetException e) { - tokenizer.errTreeBuilder("Internal encoding declaration named an unsupported chararacter encoding \u201C" - + internalCharset + "\u201D."); + tokenizer.errTreeBuilder( + Encoding.msgBadInternalCharset(internalCharset)); return false; } } @@ -453,8 +502,7 @@ protected Encoding encodingFromExternalDeclaration(String encoding) } return whineAboutEncodingAndReturnCanonical(encoding, cs); } catch (UnsupportedCharsetException e) { - tokenizer.err("Unsupported character encoding name: \u201C" + encoding - + "\u201D. Will sniff."); + tokenizer.err(Encoding.msgBadEncoding(encoding) + " Will sniff."); swallowBom = true; } return null; // keep the compiler happy @@ -470,7 +518,7 @@ protected Encoding whineAboutEncodingAndReturnCanonical(String encoding, Encoding cs) throws SAXException { String canonName = cs.getCanonName(); if (!canonName.equals(encoding)) { - tokenizer.err(Encoding.msgNotPreferredName(encoding, canonName)); + tokenizer.err(Encoding.msgNotCanonicalName(encoding, canonName)); } return cs; } From 76e6d62526af5f9910c764c9329c622da985a0d7 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sun, 13 Sep 2020 04:06:52 +0900 Subject: [PATCH 12/15] Use shared messages in HtmlInputStreamReader This change just updates HtmlInputStreamReader to use shared message strings from the Encoding class for some of its error messages. 
--- .../htmlparser/io/HtmlInputStreamReader.java | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java index 4facce4a..45e0278f 100755 --- a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java +++ b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java @@ -139,9 +139,7 @@ public HtmlInputStreamReader(InputStream inputStream, if (encoding == null) { declared = false; } else if (encoding != Encoding.UTF8) { - err("Legacy encoding \u201C" - + encoding.getCanonName() - + "\u201D used. Documents must use UTF-8."); + err(Encoding.msgLegacyEncoding(encoding.getCanonName())); } if (encoding == null && (heuristics == Heuristics.CHARDET || heuristics == Heuristics.ALL)) { @@ -157,7 +155,8 @@ public HtmlInputStreamReader(InputStream inputStream, encoding = Encoding.WINDOWS1252; } if (!declared) { - err("The character encoding was not declared. Proceeding using \u201C" + encoding.getCanonName() + "\u201D."); + err("The character encoding was not declared. Proceeding using" + + " \u201C" + encoding.getCanonName() + "\u201D."); } if (driver != null) { driver.setEncoding(encoding, Confidence.TENTATIVE); @@ -168,9 +167,7 @@ public HtmlInputStreamReader(InputStream inputStream, driver.setEncoding(Encoding.UTF8, Confidence.CERTAIN); } } else { - err("Legacy encoding \u201C" - + encoding.getCanonName() - + "\u201D used. Documents must use UTF-8."); + err(Encoding.msgLegacyEncoding(encoding.getCanonName())); if (driver != null) { driver.setEncoding(Encoding.UTF16, Confidence.CERTAIN); } From 2e31f9c3189a0fe618d1e6977eded461577dc827 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sun, 13 Sep 2020 04:08:43 +0900 Subject: [PATCH 13/15] Use shared message strings in MetaSniffer This change just updates MetaSniffer to use shared message strings from the Encoding class for some of its error messages. 
--- src/nu/validator/htmlparser/io/MetaSniffer.java | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/nu/validator/htmlparser/io/MetaSniffer.java b/src/nu/validator/htmlparser/io/MetaSniffer.java index cc4bae27..4bbcf0cf 100755 --- a/src/nu/validator/htmlparser/io/MetaSniffer.java +++ b/src/nu/validator/htmlparser/io/MetaSniffer.java @@ -163,7 +163,7 @@ protected boolean tryCharset(String encoding) throws SAXException { try { if ("utf-16be".equals(encoding) || "utf-16le".equals(encoding)) { this.characterEncoding = Encoding.UTF8; - err("The internal character encoding declaration specified \u201C" + encoding + "\u201D which is not a rough superset of ASCII. Using \u201CUTF-8\u201D instead."); + err(Encoding.msgIgnoredCharset(encoding, "utf-8")); return true; } else { Encoding cs = Encoding.forName(encoding); @@ -171,14 +171,12 @@ protected boolean tryCharset(String encoding) throws SAXException { if (!cs.getCanonName().equals(encoding)) { err(Encoding.msgNotCanonicalName(encoding, canonName)); this.characterEncoding = cs; - } else { - warn("Using \u201C" + actual.getCanonName() + "\u201D instead of the declared encoding \u201C" + encoding + "\u201D."); - this.characterEncoding = actual; } return true; } } catch (UnsupportedCharsetException e) { - err("Unsupported character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing."); + err(Encoding.msgBadInternalCharset(encoding) + + " Will continue sniffing."); } return false; } From 0de3e555e1d3504b22ea099cb71ab5c07aeb5900 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sun, 13 Sep 2020 04:10:48 +0900 Subject: [PATCH 14/15] Conform MetaSniffer "x-user-defined" handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change makes the parser’s meta-prescan code conform to the requirements in the spec for handling of the "x-user-defined" encoding. 
--- src/nu/validator/htmlparser/io/MetaSniffer.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/nu/validator/htmlparser/io/MetaSniffer.java b/src/nu/validator/htmlparser/io/MetaSniffer.java index 4bbcf0cf..60e157d0 100755 --- a/src/nu/validator/htmlparser/io/MetaSniffer.java +++ b/src/nu/validator/htmlparser/io/MetaSniffer.java @@ -165,6 +165,10 @@ protected boolean tryCharset(String encoding) throws SAXException { this.characterEncoding = Encoding.UTF8; err(Encoding.msgIgnoredCharset(encoding, "utf-8")); return true; + } else if ("x-user-defined".equals(encoding)) { + this.characterEncoding = Encoding.WINDOWS1252; + err(Encoding.msgIgnoredCharset("x-user-defined", "windows-1252")); + return true; } else { Encoding cs = Encoding.forName(encoding); String canonName = cs.getCanonName(); From c5a88d6d8b22e12c2bb6fb1c8a4409d7eba66b2b Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sun, 13 Sep 2020 04:14:19 +0900 Subject: [PATCH 15/15] Correct non-UTF8 handling in HtmlInputStreamReader This change corrects the code to set the right encoding in the case when the external encoding has been determined to be non-UTF8. --- src/nu/validator/htmlparser/io/HtmlInputStreamReader.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java index 45e0278f..c54e591a 100755 --- a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java +++ b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java @@ -169,7 +169,8 @@ public HtmlInputStreamReader(InputStream inputStream, } else { err(Encoding.msgLegacyEncoding(encoding.getCanonName())); if (driver != null) { - driver.setEncoding(Encoding.UTF16, Confidence.CERTAIN); + // XXX Why did we do driver.setEncoding(encoding.UTF16... ? + driver.setEncoding(encoding, Confidence.CERTAIN); } } }