diff --git a/src/nu/validator/htmlparser/extra/ChardetSniffer.java b/src/nu/validator/htmlparser/extra/ChardetSniffer.java index a7575039..4f6c5826 100644 --- a/src/nu/validator/htmlparser/extra/ChardetSniffer.java +++ b/src/nu/validator/htmlparser/extra/ChardetSniffer.java @@ -54,7 +54,9 @@ public Encoding sniff() throws IOException { detector.Init(this); detector.DoIt(source, length, false); detector.DataEnd(); - if (returnValue != null && returnValue != Encoding.WINDOWS1252 && returnValue.isAsciiSuperset()) { + if (returnValue != null && returnValue != Encoding.WINDOWS1252 + && returnValue != Encoding.UTF16BE + && returnValue != Encoding.UTF16LE) { return returnValue; } else { return null; @@ -72,10 +74,6 @@ public static void main(String[] args) { public void Notify(String charsetName) { try { Encoding enc = Encoding.forName(charsetName); - Encoding actual = enc.getActualHtmlEncoding(); - if (actual != null) { - enc = actual; - } returnValue = enc; } catch (UnsupportedCharsetException e) { returnValue = null; diff --git a/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java b/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java index f3caab5c..7aa0dde0 100644 --- a/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java +++ b/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java @@ -53,11 +53,8 @@ public Encoding sniff() throws IOException { detector.setText(this); CharsetMatch match = detector.detect(); Encoding enc = Encoding.forName(match.getName()); - Encoding actual = enc.getActualHtmlEncoding(); - if (actual != null) { - enc = actual; - } - if (enc != Encoding.WINDOWS1252 && enc.isAsciiSuperset()) { + if (enc != Encoding.WINDOWS1252 // + && enc != Encoding.UTF16BE && enc != Encoding.UTF16LE) { return enc; } else { return null; diff --git a/src/nu/validator/htmlparser/io/Driver.java b/src/nu/validator/htmlparser/io/Driver.java index a8dd387c..b5df79b6 100644 --- a/src/nu/validator/htmlparser/io/Driver.java +++ b/src/nu/validator/htmlparser/io/Driver.java @@ -21,6 +21,14 @@ * DEALINGS IN THE SOFTWARE. */ +/* + * The comments following this one that use the same comment syntax as this + * comment are quotes from the HTML Standard at https://html.spec.whatwg.org/ + * as of 10 September 2020. That document came with this statement: + * Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). This work is + * licensed under a Creative Commons Attribution 4.0 International License. + */ + package nu.validator.htmlparser.io; import java.io.IOException; @@ -214,9 +222,8 @@ public void tokenize(InputSource is, int bufferSize) tokenizer.getErrorHandler(), tokenizer, this, heuristics); } else { if (this.characterEncoding != Encoding.UTF8) { - errorWithoutLocation("Legacy encoding \u201C" - + this.characterEncoding.getCanonName() - + "\u201D used. Documents must use UTF-8."); + errorWithoutLocation(Encoding.msgLegacyEncoding( + this.characterEncoding.getCanonName())); } becomeConfident(); this.reader = new HtmlInputStreamReader(inputStream, @@ -350,57 +357,92 @@ public void setEncoding(Encoding encoding, Confidence confidence) { } } + private void errInternalActualDiffer(String internalCharset, String actual) + throws SAXException { + if (!internalCharset.equals(actual)) { + tokenizer.errTreeBuilder( + "Ignoring internal encoding declaration \u201C" + + internalCharset + "\u201D, which disagrees with" + + " the actual encoding of the document (\u201C" + + actual + "\u201D)."); + } + } + public boolean internalEncodingDeclaration(String internalCharset) throws SAXException { + String actual = characterEncoding.getCanonName(); + if (confidence == Confidence.CERTAIN) { + errInternalActualDiffer(internalCharset, actual); + return true; + } + /* https://html.spec.whatwg.org/#changing-the-encoding-while-parsing */ try { - internalCharset = Encoding.toAsciiLowerCase(internalCharset); - Encoding cs; - if ("utf-16".equals(internalCharset) - || "utf-16be".equals(internalCharset) + if ("utf-16be".equals(actual) || "utf-16le".equals(actual)) { + errInternalActualDiffer(internalCharset, actual); + /* + * 1. If the encoding that is already being used to interpret + * the input stream is a UTF-16 encoding, then set the + * confidence to certain and return. The new encoding is ignored + * becomeConfident(); + */ + return true; + } + internalCharset = internalCharset.toLowerCase(); + Encoding cs = Encoding.forName(internalCharset); + if ("utf-16be".equals(internalCharset) || "utf-16le".equals(internalCharset)) { - tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C" - + internalCharset - + "\u201D which is not an ASCII superset. Continuing as if the encoding had been \u201Cutf-8\u201D."); + /* + * 2. If the new encoding is a UTF-16 encoding, then change it + * to UTF-8. + */ + tokenizer.errTreeBuilder( + Encoding.msgIgnoredCharset(internalCharset, "utf-8")); cs = Encoding.UTF8; internalCharset = "utf-8"; - } else { - cs = Encoding.forName(internalCharset); - } - Encoding actual = cs.getActualHtmlEncoding(); - if (actual == null) { - actual = cs; - } - if (!actual.isAsciiSuperset()) { - tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C" - + internalCharset - + "\u201D which is not an ASCII superset. Not changing the encoding."); - return false; + } else if ("x-user-defined".equals(internalCharset)) { + /* + * 3. If the new encoding is x-user-defined, then change it to + * windows-1252. + */ + tokenizer.errTreeBuilder(Encoding.msgIgnoredCharset( + "x-user-defined", "windows-1252")); + cs = Encoding.WINDOWS1252; + internalCharset = "windows-1252"; } if (characterEncoding == null) { // Reader case return true; } - if (characterEncoding == actual) { + if (characterEncoding == cs) { + /* + * 4. If the new encoding is identical or equivalent to the + * encoding that is already being used to interpret the input + * stream, then set the confidence to certain and return. + */ becomeConfident(); return true; } - if (confidence == Confidence.CERTAIN && actual != characterEncoding) { - tokenizer.errTreeBuilder("Internal encoding declaration \u201C" - + internalCharset - + "\u201D disagrees with the actual encoding of the document (\u201C" - + characterEncoding.getCanonName() + "\u201D)."); - } else { - Encoding newEnc = whineAboutEncodingAndReturnActual( - internalCharset, cs); - tokenizer.errTreeBuilder("Changing character encoding \u201C" - + internalCharset + "\u201D and reparsing."); - characterEncoding = newEnc; - throw new ReparseException(); - } - return true; + /* + * 6. Otherwise, navigate to the document again, with + * historyHandling set to "replace", and using the same source + * browsing context, but this time skip the encoding sniffing + * algorithm and instead just set the encoding to the new encoding + */ + Encoding newEnc = whineAboutEncodingAndReturnCanonical( + internalCharset, cs); + tokenizer.errTreeBuilder("Changing character encoding to \u201C" + + internalCharset + "\u201D and reparsing."); + characterEncoding = newEnc; + // Note: We intentionally don’t call becomeConfident() at this + // point. If we did, it would end up causing the exception + // java.lang.IllegalStateException: rewind() after willNotRewind() + // to be thrown later. So we are departing here from strictly + // following the ordering in the corresponding spec language, which + // specifies setting the confidence to "certain" at this point. + throw new ReparseException(); } catch (UnsupportedCharsetException e) { - tokenizer.errTreeBuilder("Internal encoding declaration named an unsupported chararacter encoding \u201C" - + internalCharset + "\u201D."); + tokenizer.errTreeBuilder( + Encoding.msgBadInternalCharset(internalCharset)); return false; } } @@ -451,17 +493,16 @@ protected Encoding encodingFromExternalDeclaration(String encoding) if (encoding == null) { return null; } - encoding = Encoding.toAsciiLowerCase(encoding); + encoding = encoding.toLowerCase(); try { Encoding cs = Encoding.forName(encoding); - if ("utf-16".equals(cs.getCanonName()) - || "utf-32".equals(cs.getCanonName())) { + if ("utf-16be".equals(cs.getCanonName()) + || "utf-16le".equals(cs.getCanonName())) { swallowBom = false; } - return whineAboutEncodingAndReturnActual(encoding, cs); + return whineAboutEncodingAndReturnCanonical(encoding, cs); } catch (UnsupportedCharsetException e) { - tokenizer.err("Unsupported character encoding name: \u201C" + encoding - + "\u201D. Will sniff."); + tokenizer.err(Encoding.msgBadEncoding(encoding) + " Will sniff."); swallowBom = true; } return null; // keep the compiler happy @@ -473,45 +514,13 @@ protected Encoding encodingFromExternalDeclaration(String encoding) * @return * @throws SAXException */ - protected Encoding whineAboutEncodingAndReturnActual(String encoding, + protected Encoding whineAboutEncodingAndReturnCanonical(String encoding, Encoding cs) throws SAXException { String canonName = cs.getCanonName(); - if (!cs.isRegistered()) { - if (encoding.startsWith("x-")) { - tokenizer.err("The encoding \u201C" - + encoding - + "\u201D is not an IANA-registered encoding. (Charmod C022)"); - } else { - tokenizer.err("The encoding \u201C" - + encoding - + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)"); - } - } else if (!canonName.equals(encoding)) { - tokenizer.err("The encoding \u201C" - + encoding - + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C" - + canonName + "\u201D. (Charmod C024)"); - } - if (cs.isShouldNot()) { - tokenizer.warn("Authors should not use the character encoding \u201C" - + encoding - + "\u201D. It is recommended to use \u201CUTF-8\u201D."); - } else if (cs.isLikelyEbcdic()) { - tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D."); - } else if (cs.isObscure()) { - tokenizer.warn("The character encoding \u201C" - + encoding - + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); - } - Encoding actual = cs.getActualHtmlEncoding(); - if (actual == null) { - return cs; - } else { - tokenizer.warn("Using \u201C" + actual.getCanonName() - + "\u201D instead of the declared encoding \u201C" - + encoding + "\u201D."); - return actual; + if (!canonName.equals(encoding)) { + tokenizer.err(Encoding.msgNotCanonicalName(encoding, canonName)); } + return cs; } private class ReparseException extends SAXException { diff --git a/src/nu/validator/htmlparser/io/Encoding.java b/src/nu/validator/htmlparser/io/Encoding.java index 123465f8..ded6afb4 100644 --- a/src/nu/validator/htmlparser/io/Encoding.java +++ b/src/nu/validator/htmlparser/io/Encoding.java @@ -44,422 +44,358 @@ public class Encoding { public static final Encoding UTF8; - public static final Encoding UTF16; - public static final Encoding UTF16LE; public static final Encoding UTF16BE; public static final Encoding WINDOWS1252; - private static String[] SHOULD_NOT = { "jisx02121990", "xjis0208" }; + private static Map encodingByLabel = + new HashMap(); - private static String[] BANNED = { "bocu1", "cesu8", "compoundtext", - "iscii91", "macarabic", "maccentraleurroman", "maccroatian", - "maccyrillic", "macdevanagari", "macfarsi", "macgreek", - "macgujarati", "macgurmukhi", "machebrew", "macicelandic", - "macroman", "macromanian", "macthai", "macturkish", "macukranian", - "scsu", "utf32", "utf32be", "utf32le", "utf7", "ximapmailboxname", - "xjisautodetect", "xutf16bebom", "xutf16lebom", "xutf32bebom", - "xutf32lebom", "xutf16oppositeendian", "xutf16platformendian", - "xutf32oppositeendian", "xutf32platformendian" }; + private static void createEncoding(String name, String[] labels) { + if (!Charset.isSupported(name)) { + return; + } + Charset cs = Charset.forName(name); + Encoding enc = new Encoding(name.toLowerCase().intern(), cs); + for (String label : labels) { + encodingByLabel.put(label, enc); + } + } - /* From the table at https://encoding.spec.whatwg.org/#names-and-labels, - * everything in the Labels column, sorted */ - private static String[] NOT_OBSCURE = { // - "866", // - "ansi_x3.4-1968", // - "arabic", // - "ascii", // - "asmo-708", // - "big5", // - "big5-hkscs", // - "chinese", // - "cn-big5", // - "cp1250", // - "cp1251", // - "cp1252", // - "cp1253", // - "cp1254", // - "cp1255", // - "cp1256", // - "cp1257", // - "cp1258", // - "cp819", // - "cp866", // - "csbig5", // - "cseuckr", // - "cseucpkdfmtjapanese", // - "csgb2312", // - "csibm866", // - "csiso2022jp", // - "csiso2022kr", // - "csiso58gb231280", // - "csiso88596e", // - "csiso88596i", // - "csiso88598e", // - "csiso88598i", // - "csisolatin1", // - "csisolatin2", // - "csisolatin3", // - "csisolatin4", // - "csisolatin5", // - "csisolatin6", // - "csisolatin9", // - "csisolatinarabic", // - "csisolatincyrillic", // - "csisolatingreek", // - "csisolatinhebrew", // - "cskoi8r", // - "csksc56011987", // - "csmacintosh", // - "csshiftjis", // - "csunicode", // - "cyrillic", // - "dos-874", // - "ecma-114", // - "ecma-118", // - "elot_928", // - "euc-jp", // - "euc-kr", // - "gb18030", // - "gb2312", // - "gb_2312", // - "gb_2312-80", // - "gbk", // - "greek", // - "greek8", // - "hebrew", // - "hz-gb-2312", // - "ibm819", // - "ibm866", // - "iso-10646-ucs-2", // - "iso-2022-cn", // - "iso-2022-cn-ext", // - "iso-2022-jp", // - "iso-2022-kr", // - "iso-8859-1", // - "iso-8859-10", // - "iso-8859-11", // - "iso-8859-13", // - "iso-8859-14", // - "iso-8859-15", // - "iso-8859-16", // - "iso-8859-2", // - "iso-8859-3", // - "iso-8859-4", // - "iso-8859-5", // - "iso-8859-6", // - "iso-8859-6-e", // - "iso-8859-6-i", // - "iso-8859-7", // - "iso-8859-8", // - "iso-8859-8-e", // - "iso-8859-8-i", // - "iso-8859-9", // - "iso-ir-100", // - "iso-ir-101", // - "iso-ir-109", // - "iso-ir-110", // - "iso-ir-126", // - "iso-ir-127", // - "iso-ir-138", // - "iso-ir-144", // - "iso-ir-148", // - "iso-ir-149", // - "iso-ir-157", // - "iso-ir-58", // - "iso8859-1", // - "iso8859-10", // - "iso8859-11", // - "iso8859-13", // - "iso8859-14", // - "iso8859-15", // - "iso8859-2", // - "iso8859-3", // - "iso8859-4", // - "iso8859-5", // - "iso8859-6", // - "iso8859-7", // - "iso8859-8", // - "iso8859-9", // - "iso88591", // - "iso885910", // - "iso885911", // - "iso885913", // - "iso885914", // - "iso885915", // - "iso88592", // - "iso88593", // - "iso88594", // - "iso88595", // - "iso88596", // - "iso88597", // - "iso88598", // - "iso88599", // - "iso_8859-1", // - "iso_8859-15", // - "iso_8859-1:1987", // - "iso_8859-2", // - "iso_8859-2:1987", // - "iso_8859-3", // - "iso_8859-3:1988", // - "iso_8859-4", // - "iso_8859-4:1988", // - "iso_8859-5", // - "iso_8859-5:1988", // - "iso_8859-6", // - "iso_8859-6:1987", // - "iso_8859-7", // - "iso_8859-7:1987", // - "iso_8859-8", // - "iso_8859-8:1988", // - "iso_8859-9", // - "iso_8859-9:1989", // - "koi", // - "koi8", // - "koi8-r", // - "koi8-ru", // - "koi8-u", // - "koi8_r", // - "korean", // - "ks_c_5601-1987", // - "ks_c_5601-1989", // - "ksc5601", // - "ksc_5601", // - "l1", // - "l2", // - "l3", // - "l4", // - "l5", // - "l6", // - "l9", // - "latin1", // - "latin2", // - "latin3", // - "latin4", // - "latin5", // - "latin6", // - "logical", // - "mac", // - "macintosh", // - "ms932", // - "ms_kanji", // - "replacement", // - "shift-jis", // - "shift_jis", // - "sjis", // - "sun_eu_greek", // - "tis-620", // - "ucs-2", // - "unicode", // - "unicode-1-1-utf-8", // - "unicode11utf8", // - "unicode20utf8", // - "unicodefeff", // - "unicodefffe", // - "us-ascii", // - "utf-16", // - "utf-16be", // - "utf-16le", // - "utf-8", // - "utf8", // - "visual", // - "windows-1250", // - "windows-1251", // - "windows-1252", // - "windows-1253", // - "windows-1254", // - "windows-1255", // - "windows-1256", // - "windows-1257", // - "windows-1258", // - "windows-31j", // - "windows-874", // - "windows-949", // - "x-cp1250", // - "x-cp1251", // - "x-cp1252", // - "x-cp1253", // - "x-cp1254", // - "x-cp1255", // - "x-cp1256", // - "x-cp1257", // - "x-cp1258", // - "x-euc-jp", // - "x-gbk", // - "x-mac-cyrillic", // - "x-mac-roman", // - "x-mac-ukrainian", // - "x-sjis", // - "x-unicode20utf8", // - "x-user-defined", // - "x-x-big5", // - }; - private static Map encodingByCookedName = new HashMap(); + static { + /* See https://encoding.spec.whatwg.org/#names-and-labels */ + createEncoding( // + "UTF-8", new String[] { // + "unicode-1-1-utf-8", // + "unicode11utf8", // + "unicode20utf8", // + "utf-8", // + "utf8", // + "x-unicode20utf8" }); + createEncoding( // + "IBM866", new String[] { // + "866", // + "cp866", // + "csibm866", // + "ibm866" }); + createEncoding( // + "ISO-8859-2", new String[] { // + "csisolatin2", // + "iso-8859-2", // + "iso-ir-101", // + "iso8859-2", // + "iso88592", // + "iso_8859-2", // + "iso_8859-2:1987", // + "l2", // + "latin2" }); + createEncoding( // + "ISO-8859-3", new String[] { // + "csisolatin3", // + "iso-8859-3", // + "iso-ir-109", // + "iso8859-3", // + "iso88593", // + "iso_8859-3", // + "iso_8859-3:1988", // + "l3", // + "latin3" }); + createEncoding( // + "ISO-8859-4", new String[] { // + "csisolatin4", // + "iso-8859-4", // + "iso-ir-110", // + "iso8859-4", // + "iso88594", // + "iso_8859-4", // + "iso_8859-4:1988", // + "l4", // + "latin4" }); + createEncoding( // + "ISO-8859-5", new String[] { // + "csisolatincyrillic", // + "cyrillic", // + "iso-8859-5", // + "iso-ir-144", // + "iso8859-5", // + "iso88595", // + "iso_8859-5", // + "iso_8859-5:1988" }); + createEncoding( // + "ISO-8859-6", new String[] { // + "arabic", // + "asmo-708", // + "csiso88596e", // + "csiso88596i", // + "csisolatinarabic", // + "ecma-114", // + "iso-8859-6", // + "iso-8859-6-e", // + "iso-8859-6-i", // + "iso-ir-127", // + "iso8859-6", // + "iso88596", // + "iso_8859-6", // + "iso_8859-6:1987" }); + createEncoding( // + "ISO-8859-7", new String[] { // + "csisolatingreek", // + "ecma-118", // + "elot_928", // + "greek", // + "greek8", // + "iso-8859-7", // + "iso-ir-126", // + "iso8859-7", // + "iso88597", // + "iso_8859-7", // + "iso_8859-7:1987", // + "sun_eu_greek" }); + createEncoding( // + "ISO-8859-8", new String[] { // + "csiso88598e", // + "csisolatinhebrew", // + "hebrew", // + "iso-8859-8", // + "iso-8859-8-e", // + "iso-ir-138", // + "iso8859-8", // + "iso88598", // + "iso_8859-8", // + "iso_8859-8:1988", // + "visual" }); + createEncoding( // + // Unsupported in Java + "ISO-8859-8-I", new String[] { // + "csiso88598i", // + "iso-8859-8-i", // + "logical" }); + createEncoding( // + // Unsupported in Java + "ISO-8859-10", new String[] { // + "csisolatin6", // + "iso-8859-10", // + "iso-ir-157", // + "iso8859-10", // + "iso885910", // + "l6", // + "latin6" }); + createEncoding( // + "ISO-8859-13", new String[] { // + "iso-8859-13", // + "iso8859-13", // + "iso885913" }); + createEncoding( // + // Unsupported in Java + "ISO-8859-14", new String[] { // + "iso-8859-14", // + "iso8859-14", // + "iso885914" }); + createEncoding( // + "ISO-8859-15", new String[] { // + "csisolatin9", // + "iso-8859-15", // + "iso8859-15", // + "iso885915", // + "iso_8859-15", // + "l9" }); + createEncoding( // + "ISO-8859-16", new String[] { // + "iso-8859-16" }); + createEncoding( // + "KOI8-R", new String[] { // + "cskoi8r", // + "koi", // + "koi8", // + "koi8-r", // + "koi8_r" }); + createEncoding( // + "KOI8-U", new String[] { // + "koi8-ru", // + "koi8-u" }); + createEncoding( // + // Unsupported in Java + "macintosh", new String[] { // + "csmacintosh", // + "mac", // + "macintosh", // + "x-mac-roman" }); + createEncoding( // + "windows-874", new String[] { // + "dos-874", // + "iso-8859-11", // + "iso8859-11", // + "iso885911", // + "tis-620", // + "windows-874" }); + createEncoding( // + "windows-1250", new String[] { // + "cp1250", // + "windows-1250", // + "x-cp1250" }); + createEncoding( // + "windows-1251", new String[] { // + "cp1251", // + "windows-1251", // + "x-cp1251" }); + createEncoding( // + "windows-1252", new String[] { // + "ansi_x3.4-1968", // + "ascii", // + "cp1252", // + "cp819", // + "csisolatin1", // + "ibm819", // + "iso-8859-1", // + "iso-ir-100", // + "iso8859-1", // + "iso88591", // + "iso_8859-1", // + "iso_8859-1:1987", // + "l1", // + "latin1", // + "us-ascii", // + "windows-1252", // + "x-cp1252" }); + createEncoding( // + "windows-1253", new String[] { // + "cp1253", // + "windows-1253", // + "x-cp1253" }); + createEncoding( // + "windows-1254", new String[] { // + "cp1254", // + "csisolatin5", // + "iso-8859-9", // + "iso-ir-148", // + "iso8859-9", // + "iso88599", // + "iso_8859-9", // + "iso_8859-9:1989", // + "l5", // + "latin5", // + "windows-1254", // + "x-cp1254" }); + createEncoding( // + "windows-1255", new String[] { // + "cp1255", // + "windows-1255", // + "x-cp1255" }); + createEncoding( // + "windows-1256", new String[] { // + "cp1256", // + "windows-1256", // + "x-cp1256" }); + createEncoding( // + "windows-1257", new String[] { // + "cp1257", // + "windows-1257", // + "x-cp1257" }); + createEncoding( // + "windows-1258", new String[] { // + "cp1258", // + "windows-1258", // + "x-cp1258" }); + createEncoding( // + // Unsupported in Java + "x-mac-cyrillic", new String[] { // + "x-mac-cyrillic", // + "x-mac-ukrainian" }); + createEncoding( // + "GBK", new String[] { // + "chinese", // + "csgb2312", // + "csiso58gb231280", // + "gb2312", // + "gb_2312", // + "gb_2312-80", // + "gbk", // + "iso-ir-58", // + "x-gbk" }); + createEncoding( // + "gb18030", new String[] { // + "gb18030" }); + createEncoding( // + "Big5", new String[] { // + "big5", // + "big5-hkscs", // + "cn-big5", // + "csbig5", // + "x-x-big5" }); + createEncoding( // + "EUC-JP", new String[] { // + "cseucpkdfmtjapanese", // + "euc-jp", // + "x-euc-jp" }); + createEncoding( // + "ISO-2022-JP", new String[] { // + "csiso2022jp", // + "iso-2022-jp" }); + createEncoding( // + "Shift_JIS", new String[] { // + "csshiftjis", // + "ms932", // + "ms_kanji", // + "shift-jis", // + "shift_jis", // + "sjis", // + "windows-31j", // + "x-sjis" }); + createEncoding( // + "EUC-KR", new String[] { // + "cseuckr", // + "csksc56011987", // + "euc-kr", // + "iso-ir-149", // + "korean", // + "ks_c_5601-1987", // + "ks_c_5601-1989", // + "ksc5601", // + "ksc_5601", // + "windows-949" }); + createEncoding( // + // Special case + "replacement", new String[] { // + "csiso2022kr", // + "hz-gb-2312", // + "iso-2022-cn", // + "iso-2022-cn-ext", // + "iso-2022-kr", // + "replacement" }); + createEncoding( // + "UTF-16BE", new String[] { // + "unicodefffe", // + "utf-16be" }); + createEncoding( // + "UTF-16LE", new String[] { // + "csunicode", // + "iso-10646-ucs-2", // + "ucs-2", // + "unicode", // + "unicodefeff", // + "utf-16", // + "utf-16le" }); + createEncoding( // + // Special case + "x-user-defined", new String[] { // + "x-user-defined" }); + } private final String canonName; private final Charset charset; - private final boolean asciiSuperset; - - private final boolean obscure; - - private final boolean shouldNot; - - private final boolean likelyEbcdic; - - private Encoding actualHtmlEncoding = null; - static { - byte[] testBuf = new byte[0x7F]; - for (int i = 0; i < 0x7F; i++) { - if (isAsciiSupersetnessSensitive(i)) { - testBuf[i] = (byte) i; - } else { - testBuf[i] = (byte) 0x20; - } - } - - Set encodings = new HashSet(); - - SortedMap charsets = Charset.availableCharsets(); - for (Map.Entry entry : charsets.entrySet()) { - Charset cs = entry.getValue(); - String name = toNameKey(cs.name()); - String canonName = toAsciiLowerCase(cs.name()); - if (!isBanned(stripDashAndUnderscore(name))) { - name = name.intern(); - boolean asciiSuperset = asciiMapsToBasicLatin(testBuf, cs); - Encoding enc = new Encoding(canonName.intern(), cs, - asciiSuperset, isObscure(name), - isShouldNot(stripDashAndUnderscore(name)), - isLikelyEbcdic(name, asciiSuperset)); - encodings.add(enc); - Set aliases = cs.aliases(); - for (String alias : aliases) { - encodingByCookedName.put(toNameKey(alias).intern(), enc); - } - } - } - // Overwrite possible overlapping aliases with the real things--just in - // case - for (Encoding encoding : encodings) { - encodingByCookedName.put(toNameKey(encoding.getCanonName()), - encoding); - } UTF8 = forName("utf-8"); - UTF16 = forName("utf-16"); UTF16BE = forName("utf-16be"); UTF16LE = forName("utf-16le"); WINDOWS1252 = forName("windows-1252"); - try { - forName("iso-8859-1").actualHtmlEncoding = forName("windows-1252"); - } catch (UnsupportedCharsetException e) { - } - try { - forName("iso-8859-9").actualHtmlEncoding = forName("windows-1254"); - } catch (UnsupportedCharsetException e) { - } - try { - forName("iso-8859-11").actualHtmlEncoding = forName("windows-874"); - } catch (UnsupportedCharsetException e) { - } - try { - forName("x-iso-8859-11").actualHtmlEncoding = forName("windows-874"); - } catch (UnsupportedCharsetException e) { - } - try { - forName("tis-620").actualHtmlEncoding = forName("windows-874"); - } catch (UnsupportedCharsetException e) { - } - try { - forName("gb_2312-80").actualHtmlEncoding = forName("gbk"); - } catch (UnsupportedCharsetException e) { - } - try { - forName("gb2312").actualHtmlEncoding = forName("gbk"); - } catch (UnsupportedCharsetException e) { - } - try { - encodingByCookedName.put("x-x-big5", forName("big5")); - } catch (UnsupportedCharsetException e) { - } - try { - encodingByCookedName.put("euc-kr", forName("windows-949")); - } catch (UnsupportedCharsetException e) { - } - try { - encodingByCookedName.put("ks_c_5601-1987", forName("windows-949")); - } catch (UnsupportedCharsetException e) { - } - } - - private static boolean isAsciiSupersetnessSensitive(int c) { - return (c >= 0x09 && c <= 0x0D) || (c >= 0x20 && c <= 0x22) - || (c >= 0x26 && c <= 0x27) || (c >= 0x2C && c <= 0x3F) - || (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A); - } - - private static boolean isObscure(String lowerCasePreferredIanaName) { - return !(Arrays.binarySearch(NOT_OBSCURE, lowerCasePreferredIanaName) > -1); - } - - private static boolean isBanned(String lowerCasePreferredIanaName) { - if (lowerCasePreferredIanaName.startsWith("xibm")) { - return true; - } - return (Arrays.binarySearch(BANNED, lowerCasePreferredIanaName) > -1); - } - - private static boolean isShouldNot(String lowerCasePreferredIanaName) { - return (Arrays.binarySearch(SHOULD_NOT, lowerCasePreferredIanaName) > -1); - } - - /** - * @param testBuf - * @param cs - */ - private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) { - CharsetDecoder dec = cs.newDecoder(); - dec.onMalformedInput(CodingErrorAction.REPORT); - dec.onUnmappableCharacter(CodingErrorAction.REPORT); - Reader r = new InputStreamReader(new ByteArrayInputStream(testBuf), dec); - try { - for (int i = 0; i < 0x7F; i++) { - if (isAsciiSupersetnessSensitive(i)) { - if (r.read() != i) { - return false; - } - } else { - if (r.read() != 0x20) { - return false; - } - } - } - } catch (IOException e) { - return false; - } catch (Exception e) { - return false; - } catch (CoderMalfunctionError e) { - return false; - } - - return true; - } - - private static boolean isLikelyEbcdic(String canonName, - boolean asciiSuperset) { - if (!asciiSuperset) { - return (canonName.startsWith("cp") || canonName.startsWith("ibm") || canonName.startsWith("xibm")); - } else { - return false; - } } public static Encoding forName(String name) { - Encoding rv = encodingByCookedName.get(toNameKey(name)); + Encoding rv = encodingByLabel.get(toNameKey(name)); if (rv == null) { throw new UnsupportedCharsetException(name); } else { @@ -486,61 +422,13 @@ public static String toNameKey(String str) { return new String(buf, 0, j); } - public static String stripDashAndUnderscore(String str) { - if (str == null) { - return null; - } - char[] buf = new char[str.length()]; - for (int i = 0; i < str.length(); i++) { - char c = str.charAt(i); - if (c == '-' || c == '_') { - buf[i] = c; - } - } - return new String(buf); - } - - public static String toAsciiLowerCase(String str) { - if (str == null) { - return null; - } - char[] buf = new char[str.length()]; - for (int i = 0; i < str.length(); i++) { - char c = str.charAt(i); - if (c >= 'A' && c <= 'Z') { - c += 0x20; - } - buf[i] = c; - } - return new String(buf); - } - /** * @param canonName * @param charset - * @param asciiSuperset - * @param obscure - * @param shouldNot - * @param likelyEbcdic */ - private Encoding(final String canonName, final Charset charset, - final boolean asciiSuperset, final boolean obscure, - final boolean shouldNot, final boolean likelyEbcdic) { + private Encoding(final String canonName, final Charset charset) { this.canonName = canonName; this.charset = charset; - this.asciiSuperset = asciiSuperset; - this.obscure = obscure; - this.shouldNot = shouldNot; - this.likelyEbcdic = likelyEbcdic; - } - - /** - * Returns the asciiSuperset. - * - * @return the asciiSuperset - */ - public boolean isAsciiSuperset() { - return asciiSuperset; } /** @@ -552,37 +440,6 @@ public String getCanonName() { return canonName; } - /** - * Returns the likelyEbcdic. - * - * @return the likelyEbcdic - */ - public boolean isLikelyEbcdic() { - return likelyEbcdic; - } - - /** - * Returns the obscure. - * - * @return the obscure - */ - public boolean isObscure() { - return obscure; - } - - /** - * Returns the shouldNot. - * - * @return the shouldNot - */ - public boolean isShouldNot() { - return shouldNot; - } - - public boolean isRegistered() { - return !canonName.startsWith("x-"); - } - /** * @return * @see java.nio.charset.Charset#canEncode() @@ -607,24 +464,36 @@ public CharsetEncoder newEncoder() { return charset.newEncoder(); } - /** - * Returns the actualHtmlEncoding. - * - * @return the actualHtmlEncoding - */ - public Encoding getActualHtmlEncoding() { - return actualHtmlEncoding; + protected static String msgLegacyEncoding(String name) { + return "Legacy encoding \u201C" + name + "\u201D used. Documents must" + + " use UTF-8."; + } + + protected static String msgIgnoredCharset(String ignored, String name) { + return "Internal encoding declaration specified \u201C" + ignored + + "\u201D. Continuing as if the encoding had been \u201C" + + name + "\u201D."; + } + protected static String msgNotCanonicalName(String label, String name) { + return "The encoding \u201C" + label + "\u201D is not the canonical" + + " name of the character encoding in use. The canonical name" + + " is \u201C" + name + "\u201D. (Charmod C024)"; + } + + protected static String msgBadInternalCharset(String internalCharset) { + return "Internal encoding declaration named an unsupported character" + + " encoding \u201C" + internalCharset + "\u201D."; + } + + protected static String msgBadEncoding(String name) { + return "Unsupported character encoding name: \u201C" + name + "\u201D."; } public static void main(String[] args) { - for (Map.Entry entry : encodingByCookedName.entrySet()) { + for (Map.Entry entry : encodingByLabel.entrySet()) { String name = entry.getKey(); Encoding enc = entry.getValue(); - System.out.printf( - "%21s: canon %21s, obs %5s, reg %5s, asc %5s, ebc %5s\n", - name, enc.getCanonName(), enc.isObscure(), - enc.isRegistered(), enc.isAsciiSuperset(), - enc.isLikelyEbcdic()); + System.out.printf("%21s: canon %13s\n", name, enc.getCanonName()); } } diff --git a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java index 4facce4a..c54e591a 100755 --- a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java +++ b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java @@ -139,9 +139,7 @@ public HtmlInputStreamReader(InputStream inputStream, if (encoding == null) { declared = false; } else if (encoding != Encoding.UTF8) { - err("Legacy encoding \u201C" - + encoding.getCanonName() - + "\u201D used. Documents must use UTF-8."); + err(Encoding.msgLegacyEncoding(encoding.getCanonName())); } if (encoding == null && (heuristics == Heuristics.CHARDET || heuristics == Heuristics.ALL)) { @@ -157,7 +155,8 @@ public HtmlInputStreamReader(InputStream inputStream, encoding = Encoding.WINDOWS1252; } if (!declared) { - err("The character encoding was not declared. Proceeding using \u201C" + encoding.getCanonName() + "\u201D."); + err("The character encoding was not declared. Proceeding using" + + " \u201C" + encoding.getCanonName() + "\u201D."); } if (driver != null) { driver.setEncoding(encoding, Confidence.TENTATIVE); @@ -168,11 +167,10 @@ public HtmlInputStreamReader(InputStream inputStream, driver.setEncoding(Encoding.UTF8, Confidence.CERTAIN); } } else { - err("Legacy encoding \u201C" - + encoding.getCanonName() - + "\u201D used. Documents must use UTF-8."); + err(Encoding.msgLegacyEncoding(encoding.getCanonName())); if (driver != null) { - driver.setEncoding(Encoding.UTF16, Confidence.CERTAIN); + // XXX Why did we do driver.setEncoding(encoding.UTF16... ? + driver.setEncoding(encoding, Confidence.CERTAIN); } } } diff --git a/src/nu/validator/htmlparser/io/MetaSniffer.java b/src/nu/validator/htmlparser/io/MetaSniffer.java index 9deaef7a..60e157d0 100755 --- a/src/nu/validator/htmlparser/io/MetaSniffer.java +++ b/src/nu/validator/htmlparser/io/MetaSniffer.java @@ -159,55 +159,28 @@ public String getEncoding() { } protected boolean tryCharset(String encoding) throws SAXException { - encoding = Encoding.toAsciiLowerCase(encoding); + encoding = encoding.toLowerCase(); try { - // XXX spec says only UTF-16 - if ("utf-16".equals(encoding) || "utf-16be".equals(encoding) || "utf-16le".equals(encoding) || "utf-32".equals(encoding) || "utf-32be".equals(encoding) || "utf-32le".equals(encoding)) { + if ("utf-16be".equals(encoding) || "utf-16le".equals(encoding)) { this.characterEncoding = Encoding.UTF8; - err("The internal character encoding declaration specified \u201C" + encoding + "\u201D which is not a rough superset of ASCII. Using \u201CUTF-8\u201D instead."); + err(Encoding.msgIgnoredCharset(encoding, "utf-8")); + return true; + } else if ("x-user-defined".equals(encoding)) { + this.characterEncoding = Encoding.WINDOWS1252; + err(Encoding.msgIgnoredCharset("x-user-defined", "windows-1252")); return true; } else { Encoding cs = Encoding.forName(encoding); String canonName = cs.getCanonName(); - if (!cs.isAsciiSuperset()) { - err("The encoding \u201C" - + encoding - + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. Continuing the sniffing algorithm."); - return false; - } - if (!cs.isRegistered()) { - if (encoding.startsWith("x-")) { - err("The encoding \u201C" - + encoding - + "\u201D is not an IANA-registered encoding. (Charmod C022)"); - } else { - err("The encoding \u201C" - + encoding - + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)"); - } - } else if (!cs.getCanonName().equals(encoding)) { - err("The encoding \u201C" + encoding - + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C" - + canonName + "\u201D. (Charmod C024)"); - } - if (cs.isShouldNot()) { - warn("Authors should not use the character encoding \u201C" - + encoding - + "\u201D. It is recommended to use \u201CUTF-8\u201D."); - } else if (cs.isObscure()) { - warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); - } - Encoding actual = cs.getActualHtmlEncoding(); - if (actual == null) { + if (!cs.getCanonName().equals(encoding)) { + err(Encoding.msgNotCanonicalName(encoding, canonName)); this.characterEncoding = cs; - } else { - warn("Using \u201C" + actual.getCanonName() + "\u201D instead of the declared encoding \u201C" + encoding + "\u201D."); - this.characterEncoding = actual; } return true; } } catch (UnsupportedCharsetException e) { - err("Unsupported character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing."); + err(Encoding.msgBadInternalCharset(encoding) + + " Will continue sniffing."); } return false; }