diff --git a/src/nu/validator/htmlparser/extra/ChardetSniffer.java b/src/nu/validator/htmlparser/extra/ChardetSniffer.java
index a7575039..4f6c5826 100644
--- a/src/nu/validator/htmlparser/extra/ChardetSniffer.java
+++ b/src/nu/validator/htmlparser/extra/ChardetSniffer.java
@@ -54,7 +54,9 @@ public Encoding sniff() throws IOException {
detector.Init(this);
detector.DoIt(source, length, false);
detector.DataEnd();
- if (returnValue != null && returnValue != Encoding.WINDOWS1252 && returnValue.isAsciiSuperset()) {
+ if (returnValue != null && returnValue != Encoding.WINDOWS1252
+ && returnValue != Encoding.UTF16BE
+ && returnValue != Encoding.UTF16LE) {
return returnValue;
} else {
return null;
@@ -72,10 +74,6 @@ public static void main(String[] args) {
public void Notify(String charsetName) {
try {
Encoding enc = Encoding.forName(charsetName);
- Encoding actual = enc.getActualHtmlEncoding();
- if (actual != null) {
- enc = actual;
- }
returnValue = enc;
} catch (UnsupportedCharsetException e) {
returnValue = null;
diff --git a/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java b/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java
index f3caab5c..7aa0dde0 100644
--- a/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java
+++ b/src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java
@@ -53,11 +53,8 @@ public Encoding sniff() throws IOException {
detector.setText(this);
CharsetMatch match = detector.detect();
Encoding enc = Encoding.forName(match.getName());
- Encoding actual = enc.getActualHtmlEncoding();
- if (actual != null) {
- enc = actual;
- }
- if (enc != Encoding.WINDOWS1252 && enc.isAsciiSuperset()) {
+ if (enc != Encoding.WINDOWS1252 //
+ && enc != Encoding.UTF16BE && enc != Encoding.UTF16LE) {
return enc;
} else {
return null;
diff --git a/src/nu/validator/htmlparser/io/Driver.java b/src/nu/validator/htmlparser/io/Driver.java
index a8dd387c..b5df79b6 100644
--- a/src/nu/validator/htmlparser/io/Driver.java
+++ b/src/nu/validator/htmlparser/io/Driver.java
@@ -21,6 +21,14 @@
* DEALINGS IN THE SOFTWARE.
*/
+/*
+ * The comments following this one that use the same comment syntax as this
+ * comment are quotes from the HTML Standard at https://html.spec.whatwg.org/
+ * as of 10 September 2020. That document came with this statement:
+ * Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). This work is
+ * licensed under a Creative Commons Attribution 4.0 International License.
+ */
+
package nu.validator.htmlparser.io;
import java.io.IOException;
@@ -214,9 +222,8 @@ public void tokenize(InputSource is, int bufferSize)
tokenizer.getErrorHandler(), tokenizer, this, heuristics);
} else {
if (this.characterEncoding != Encoding.UTF8) {
- errorWithoutLocation("Legacy encoding \u201C"
- + this.characterEncoding.getCanonName()
- + "\u201D used. Documents must use UTF-8.");
+ errorWithoutLocation(Encoding.msgLegacyEncoding(
+ this.characterEncoding.getCanonName()));
}
becomeConfident();
this.reader = new HtmlInputStreamReader(inputStream,
@@ -350,57 +357,92 @@ public void setEncoding(Encoding encoding, Confidence confidence) {
}
}
+ /** Reports a declared charset that differs, ignoring case, from the actual. */
+ private void errInternalActualDiffer(String internalCharset, String actual)
+ throws SAXException {
+ // Callers pass the declaration as written, so "UTF-8" must match "utf-8".
+ if (!internalCharset.equalsIgnoreCase(actual)) {
+ tokenizer.errTreeBuilder("Ignoring internal encoding declaration \u201C"
+ + internalCharset + "\u201D, which disagrees with the actual"
+ + " encoding of the document (\u201C" + actual + "\u201D).");
+ }
+ }
+
public boolean internalEncodingDeclaration(String internalCharset)
throws SAXException {
+ String actual = characterEncoding.getCanonName();
+ if (confidence == Confidence.CERTAIN) {
+ errInternalActualDiffer(internalCharset, actual);
+ return true;
+ }
+ /* https://html.spec.whatwg.org/#changing-the-encoding-while-parsing */
try {
- internalCharset = Encoding.toAsciiLowerCase(internalCharset);
- Encoding cs;
- if ("utf-16".equals(internalCharset)
- || "utf-16be".equals(internalCharset)
+ if ("utf-16be".equals(actual) || "utf-16le".equals(actual)) {
+ errInternalActualDiffer(internalCharset, actual);
+ /*
+ * 1. If the encoding that is already being used to interpret
+ * the input stream is a UTF-16 encoding, then set the
+ * confidence to certain and return. The new encoding is ignored
+ */
+ // NOTE(review): becomeConfident() is not called on this path; confirm.
+ return true;
+ }
+ internalCharset = internalCharset.toLowerCase();
+ Encoding cs = Encoding.forName(internalCharset);
+ if ("utf-16be".equals(internalCharset)
|| "utf-16le".equals(internalCharset)) {
- tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
- + internalCharset
- + "\u201D which is not an ASCII superset. Continuing as if the encoding had been \u201Cutf-8\u201D.");
+ /*
+ * 2. If the new encoding is a UTF-16 encoding, then change it
+ * to UTF-8.
+ */
+ tokenizer.errTreeBuilder(
+ Encoding.msgIgnoredCharset(internalCharset, "utf-8"));
cs = Encoding.UTF8;
internalCharset = "utf-8";
- } else {
- cs = Encoding.forName(internalCharset);
- }
- Encoding actual = cs.getActualHtmlEncoding();
- if (actual == null) {
- actual = cs;
- }
- if (!actual.isAsciiSuperset()) {
- tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
- + internalCharset
- + "\u201D which is not an ASCII superset. Not changing the encoding.");
- return false;
+ } else if ("x-user-defined".equals(internalCharset)) {
+ /*
+ * 3. If the new encoding is x-user-defined, then change it to
+ * windows-1252.
+ */
+ tokenizer.errTreeBuilder(Encoding.msgIgnoredCharset(
+ "x-user-defined", "windows-1252"));
+ cs = Encoding.WINDOWS1252;
+ internalCharset = "windows-1252";
}
if (characterEncoding == null) {
// Reader case
return true;
}
- if (characterEncoding == actual) {
+ if (characterEncoding == cs) {
+ /*
+ * 4. If the new encoding is identical or equivalent to the
+ * encoding that is already being used to interpret the input
+ * stream, then set the confidence to certain and return.
+ */
becomeConfident();
return true;
}
- if (confidence == Confidence.CERTAIN && actual != characterEncoding) {
- tokenizer.errTreeBuilder("Internal encoding declaration \u201C"
- + internalCharset
- + "\u201D disagrees with the actual encoding of the document (\u201C"
- + characterEncoding.getCanonName() + "\u201D).");
- } else {
- Encoding newEnc = whineAboutEncodingAndReturnActual(
- internalCharset, cs);
- tokenizer.errTreeBuilder("Changing character encoding \u201C"
- + internalCharset + "\u201D and reparsing.");
- characterEncoding = newEnc;
- throw new ReparseException();
- }
- return true;
+ /*
+ * 6. Otherwise, navigate to the document again, with
+ * historyHandling set to "replace", and using the same source
+ * browsing context, but this time skip the encoding sniffing
+ * algorithm and instead just set the encoding to the new encoding
+ */
+ Encoding newEnc = whineAboutEncodingAndReturnCanonical(
+ internalCharset, cs);
+ tokenizer.errTreeBuilder("Changing character encoding to \u201C"
+ + internalCharset + "\u201D and reparsing.");
+ characterEncoding = newEnc;
+ // Note: We intentionally don’t call becomeConfident() at this
+ // point. If we did, it would end up causing the exception
+ // java.lang.IllegalStateException: rewind() after willNotRewind()
+ // to be thrown later. So we are departing here from strictly
+ // following the ordering in the corresponding spec language, which
+ // specifies setting the confidence to "certain" at this point.
+ throw new ReparseException();
} catch (UnsupportedCharsetException e) {
- tokenizer.errTreeBuilder("Internal encoding declaration named an unsupported chararacter encoding \u201C"
- + internalCharset + "\u201D.");
+ tokenizer.errTreeBuilder(
+ Encoding.msgBadInternalCharset(internalCharset));
return false;
}
}
@@ -451,17 +493,16 @@ protected Encoding encodingFromExternalDeclaration(String encoding)
if (encoding == null) {
return null;
}
- encoding = Encoding.toAsciiLowerCase(encoding);
+ encoding = encoding.toLowerCase();
try {
Encoding cs = Encoding.forName(encoding);
- if ("utf-16".equals(cs.getCanonName())
- || "utf-32".equals(cs.getCanonName())) {
+ if ("utf-16be".equals(cs.getCanonName())
+ || "utf-16le".equals(cs.getCanonName())) {
swallowBom = false;
}
- return whineAboutEncodingAndReturnActual(encoding, cs);
+ return whineAboutEncodingAndReturnCanonical(encoding, cs);
} catch (UnsupportedCharsetException e) {
- tokenizer.err("Unsupported character encoding name: \u201C" + encoding
- + "\u201D. Will sniff.");
+ tokenizer.err(Encoding.msgBadEncoding(encoding) + " Will sniff.");
swallowBom = true;
}
return null; // keep the compiler happy
@@ -473,45 +514,13 @@ protected Encoding encodingFromExternalDeclaration(String encoding)
* @return
* @throws SAXException
*/
- protected Encoding whineAboutEncodingAndReturnActual(String encoding,
+ protected Encoding whineAboutEncodingAndReturnCanonical(String encoding,
Encoding cs) throws SAXException {
String canonName = cs.getCanonName();
- if (!cs.isRegistered()) {
- if (encoding.startsWith("x-")) {
- tokenizer.err("The encoding \u201C"
- + encoding
- + "\u201D is not an IANA-registered encoding. (Charmod C022)");
- } else {
- tokenizer.err("The encoding \u201C"
- + encoding
- + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)");
- }
- } else if (!canonName.equals(encoding)) {
- tokenizer.err("The encoding \u201C"
- + encoding
- + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
- + canonName + "\u201D. (Charmod C024)");
- }
- if (cs.isShouldNot()) {
- tokenizer.warn("Authors should not use the character encoding \u201C"
- + encoding
- + "\u201D. It is recommended to use \u201CUTF-8\u201D.");
- } else if (cs.isLikelyEbcdic()) {
- tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D.");
- } else if (cs.isObscure()) {
- tokenizer.warn("The character encoding \u201C"
- + encoding
- + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
- }
- Encoding actual = cs.getActualHtmlEncoding();
- if (actual == null) {
- return cs;
- } else {
- tokenizer.warn("Using \u201C" + actual.getCanonName()
- + "\u201D instead of the declared encoding \u201C"
- + encoding + "\u201D.");
- return actual;
+ if (!canonName.equals(encoding)) {
+ tokenizer.err(Encoding.msgNotCanonicalName(encoding, canonName));
}
+ return cs;
}
private class ReparseException extends SAXException {
diff --git a/src/nu/validator/htmlparser/io/Encoding.java b/src/nu/validator/htmlparser/io/Encoding.java
index 123465f8..ded6afb4 100644
--- a/src/nu/validator/htmlparser/io/Encoding.java
+++ b/src/nu/validator/htmlparser/io/Encoding.java
@@ -44,422 +44,358 @@ public class Encoding {
public static final Encoding UTF8;
- public static final Encoding UTF16;
-
public static final Encoding UTF16LE;
public static final Encoding UTF16BE;
public static final Encoding WINDOWS1252;
- private static String[] SHOULD_NOT = { "jisx02121990", "xjis0208" };
+ private static Map encodingByLabel =
+ new HashMap();
- private static String[] BANNED = { "bocu1", "cesu8", "compoundtext",
- "iscii91", "macarabic", "maccentraleurroman", "maccroatian",
- "maccyrillic", "macdevanagari", "macfarsi", "macgreek",
- "macgujarati", "macgurmukhi", "machebrew", "macicelandic",
- "macroman", "macromanian", "macthai", "macturkish", "macukranian",
- "scsu", "utf32", "utf32be", "utf32le", "utf7", "ximapmailboxname",
- "xjisautodetect", "xutf16bebom", "xutf16lebom", "xutf32bebom",
- "xutf32lebom", "xutf16oppositeendian", "xutf16platformendian",
- "xutf32oppositeendian", "xutf32platformendian" };
+ /** Registers each label for {@code name}; lowercases via Locale.ROOT so a Turkish locale cannot alter 'I'. */
+ private static void createEncoding(String name, String[] labels) {
+ if (!Charset.isSupported(name)) {
+ return; // charset unavailable in this JVM: its labels stay unregistered
+ }
+ Encoding enc = new Encoding(
+ name.toLowerCase(java.util.Locale.ROOT).intern(), Charset.forName(name));
+ for (String label : labels) {
+ encodingByLabel.put(label, enc);
+ }
+ }
- /* From the table at https://encoding.spec.whatwg.org/#names-and-labels,
- * everything in the Labels column, sorted */
- private static String[] NOT_OBSCURE = { //
- "866", //
- "ansi_x3.4-1968", //
- "arabic", //
- "ascii", //
- "asmo-708", //
- "big5", //
- "big5-hkscs", //
- "chinese", //
- "cn-big5", //
- "cp1250", //
- "cp1251", //
- "cp1252", //
- "cp1253", //
- "cp1254", //
- "cp1255", //
- "cp1256", //
- "cp1257", //
- "cp1258", //
- "cp819", //
- "cp866", //
- "csbig5", //
- "cseuckr", //
- "cseucpkdfmtjapanese", //
- "csgb2312", //
- "csibm866", //
- "csiso2022jp", //
- "csiso2022kr", //
- "csiso58gb231280", //
- "csiso88596e", //
- "csiso88596i", //
- "csiso88598e", //
- "csiso88598i", //
- "csisolatin1", //
- "csisolatin2", //
- "csisolatin3", //
- "csisolatin4", //
- "csisolatin5", //
- "csisolatin6", //
- "csisolatin9", //
- "csisolatinarabic", //
- "csisolatincyrillic", //
- "csisolatingreek", //
- "csisolatinhebrew", //
- "cskoi8r", //
- "csksc56011987", //
- "csmacintosh", //
- "csshiftjis", //
- "csunicode", //
- "cyrillic", //
- "dos-874", //
- "ecma-114", //
- "ecma-118", //
- "elot_928", //
- "euc-jp", //
- "euc-kr", //
- "gb18030", //
- "gb2312", //
- "gb_2312", //
- "gb_2312-80", //
- "gbk", //
- "greek", //
- "greek8", //
- "hebrew", //
- "hz-gb-2312", //
- "ibm819", //
- "ibm866", //
- "iso-10646-ucs-2", //
- "iso-2022-cn", //
- "iso-2022-cn-ext", //
- "iso-2022-jp", //
- "iso-2022-kr", //
- "iso-8859-1", //
- "iso-8859-10", //
- "iso-8859-11", //
- "iso-8859-13", //
- "iso-8859-14", //
- "iso-8859-15", //
- "iso-8859-16", //
- "iso-8859-2", //
- "iso-8859-3", //
- "iso-8859-4", //
- "iso-8859-5", //
- "iso-8859-6", //
- "iso-8859-6-e", //
- "iso-8859-6-i", //
- "iso-8859-7", //
- "iso-8859-8", //
- "iso-8859-8-e", //
- "iso-8859-8-i", //
- "iso-8859-9", //
- "iso-ir-100", //
- "iso-ir-101", //
- "iso-ir-109", //
- "iso-ir-110", //
- "iso-ir-126", //
- "iso-ir-127", //
- "iso-ir-138", //
- "iso-ir-144", //
- "iso-ir-148", //
- "iso-ir-149", //
- "iso-ir-157", //
- "iso-ir-58", //
- "iso8859-1", //
- "iso8859-10", //
- "iso8859-11", //
- "iso8859-13", //
- "iso8859-14", //
- "iso8859-15", //
- "iso8859-2", //
- "iso8859-3", //
- "iso8859-4", //
- "iso8859-5", //
- "iso8859-6", //
- "iso8859-7", //
- "iso8859-8", //
- "iso8859-9", //
- "iso88591", //
- "iso885910", //
- "iso885911", //
- "iso885913", //
- "iso885914", //
- "iso885915", //
- "iso88592", //
- "iso88593", //
- "iso88594", //
- "iso88595", //
- "iso88596", //
- "iso88597", //
- "iso88598", //
- "iso88599", //
- "iso_8859-1", //
- "iso_8859-15", //
- "iso_8859-1:1987", //
- "iso_8859-2", //
- "iso_8859-2:1987", //
- "iso_8859-3", //
- "iso_8859-3:1988", //
- "iso_8859-4", //
- "iso_8859-4:1988", //
- "iso_8859-5", //
- "iso_8859-5:1988", //
- "iso_8859-6", //
- "iso_8859-6:1987", //
- "iso_8859-7", //
- "iso_8859-7:1987", //
- "iso_8859-8", //
- "iso_8859-8:1988", //
- "iso_8859-9", //
- "iso_8859-9:1989", //
- "koi", //
- "koi8", //
- "koi8-r", //
- "koi8-ru", //
- "koi8-u", //
- "koi8_r", //
- "korean", //
- "ks_c_5601-1987", //
- "ks_c_5601-1989", //
- "ksc5601", //
- "ksc_5601", //
- "l1", //
- "l2", //
- "l3", //
- "l4", //
- "l5", //
- "l6", //
- "l9", //
- "latin1", //
- "latin2", //
- "latin3", //
- "latin4", //
- "latin5", //
- "latin6", //
- "logical", //
- "mac", //
- "macintosh", //
- "ms932", //
- "ms_kanji", //
- "replacement", //
- "shift-jis", //
- "shift_jis", //
- "sjis", //
- "sun_eu_greek", //
- "tis-620", //
- "ucs-2", //
- "unicode", //
- "unicode-1-1-utf-8", //
- "unicode11utf8", //
- "unicode20utf8", //
- "unicodefeff", //
- "unicodefffe", //
- "us-ascii", //
- "utf-16", //
- "utf-16be", //
- "utf-16le", //
- "utf-8", //
- "utf8", //
- "visual", //
- "windows-1250", //
- "windows-1251", //
- "windows-1252", //
- "windows-1253", //
- "windows-1254", //
- "windows-1255", //
- "windows-1256", //
- "windows-1257", //
- "windows-1258", //
- "windows-31j", //
- "windows-874", //
- "windows-949", //
- "x-cp1250", //
- "x-cp1251", //
- "x-cp1252", //
- "x-cp1253", //
- "x-cp1254", //
- "x-cp1255", //
- "x-cp1256", //
- "x-cp1257", //
- "x-cp1258", //
- "x-euc-jp", //
- "x-gbk", //
- "x-mac-cyrillic", //
- "x-mac-roman", //
- "x-mac-ukrainian", //
- "x-sjis", //
- "x-unicode20utf8", //
- "x-user-defined", //
- "x-x-big5", //
- };
- private static Map encodingByCookedName = new HashMap();
+ static {
+ /* See https://encoding.spec.whatwg.org/#names-and-labels */
+ createEncoding( //
+ "UTF-8", new String[] { //
+ "unicode-1-1-utf-8", //
+ "unicode11utf8", //
+ "unicode20utf8", //
+ "utf-8", //
+ "utf8", //
+ "x-unicode20utf8" });
+ createEncoding( //
+ "IBM866", new String[] { //
+ "866", //
+ "cp866", //
+ "csibm866", //
+ "ibm866" });
+ createEncoding( //
+ "ISO-8859-2", new String[] { //
+ "csisolatin2", //
+ "iso-8859-2", //
+ "iso-ir-101", //
+ "iso8859-2", //
+ "iso88592", //
+ "iso_8859-2", //
+ "iso_8859-2:1987", //
+ "l2", //
+ "latin2" });
+ createEncoding( //
+ "ISO-8859-3", new String[] { //
+ "csisolatin3", //
+ "iso-8859-3", //
+ "iso-ir-109", //
+ "iso8859-3", //
+ "iso88593", //
+ "iso_8859-3", //
+ "iso_8859-3:1988", //
+ "l3", //
+ "latin3" });
+ createEncoding( //
+ "ISO-8859-4", new String[] { //
+ "csisolatin4", //
+ "iso-8859-4", //
+ "iso-ir-110", //
+ "iso8859-4", //
+ "iso88594", //
+ "iso_8859-4", //
+ "iso_8859-4:1988", //
+ "l4", //
+ "latin4" });
+ createEncoding( //
+ "ISO-8859-5", new String[] { //
+ "csisolatincyrillic", //
+ "cyrillic", //
+ "iso-8859-5", //
+ "iso-ir-144", //
+ "iso8859-5", //
+ "iso88595", //
+ "iso_8859-5", //
+ "iso_8859-5:1988" });
+ createEncoding( //
+ "ISO-8859-6", new String[] { //
+ "arabic", //
+ "asmo-708", //
+ "csiso88596e", //
+ "csiso88596i", //
+ "csisolatinarabic", //
+ "ecma-114", //
+ "iso-8859-6", //
+ "iso-8859-6-e", //
+ "iso-8859-6-i", //
+ "iso-ir-127", //
+ "iso8859-6", //
+ "iso88596", //
+ "iso_8859-6", //
+ "iso_8859-6:1987" });
+ createEncoding( //
+ "ISO-8859-7", new String[] { //
+ "csisolatingreek", //
+ "ecma-118", //
+ "elot_928", //
+ "greek", //
+ "greek8", //
+ "iso-8859-7", //
+ "iso-ir-126", //
+ "iso8859-7", //
+ "iso88597", //
+ "iso_8859-7", //
+ "iso_8859-7:1987", //
+ "sun_eu_greek" });
+ createEncoding( //
+ "ISO-8859-8", new String[] { //
+ "csiso88598e", //
+ "csisolatinhebrew", //
+ "hebrew", //
+ "iso-8859-8", //
+ "iso-8859-8-e", //
+ "iso-ir-138", //
+ "iso8859-8", //
+ "iso88598", //
+ "iso_8859-8", //
+ "iso_8859-8:1988", //
+ "visual" });
+ createEncoding( //
+ // Unsupported in Java
+ "ISO-8859-8-I", new String[] { //
+ "csiso88598i", //
+ "iso-8859-8-i", //
+ "logical" });
+ createEncoding( //
+ // Unsupported in Java
+ "ISO-8859-10", new String[] { //
+ "csisolatin6", //
+ "iso-8859-10", //
+ "iso-ir-157", //
+ "iso8859-10", //
+ "iso885910", //
+ "l6", //
+ "latin6" });
+ createEncoding( //
+ "ISO-8859-13", new String[] { //
+ "iso-8859-13", //
+ "iso8859-13", //
+ "iso885913" });
+ createEncoding( //
+ // Unsupported in Java
+ "ISO-8859-14", new String[] { //
+ "iso-8859-14", //
+ "iso8859-14", //
+ "iso885914" });
+ createEncoding( //
+ "ISO-8859-15", new String[] { //
+ "csisolatin9", //
+ "iso-8859-15", //
+ "iso8859-15", //
+ "iso885915", //
+ "iso_8859-15", //
+ "l9" });
+ createEncoding( //
+ "ISO-8859-16", new String[] { //
+ "iso-8859-16" });
+ createEncoding( //
+ "KOI8-R", new String[] { //
+ "cskoi8r", //
+ "koi", //
+ "koi8", //
+ "koi8-r", //
+ "koi8_r" });
+ createEncoding( //
+ "KOI8-U", new String[] { //
+ "koi8-ru", //
+ "koi8-u" });
+ createEncoding( //
+ // Unsupported in Java
+ "macintosh", new String[] { //
+ "csmacintosh", //
+ "mac", //
+ "macintosh", //
+ "x-mac-roman" });
+ createEncoding( //
+ "windows-874", new String[] { //
+ "dos-874", //
+ "iso-8859-11", //
+ "iso8859-11", //
+ "iso885911", //
+ "tis-620", //
+ "windows-874" });
+ createEncoding( //
+ "windows-1250", new String[] { //
+ "cp1250", //
+ "windows-1250", //
+ "x-cp1250" });
+ createEncoding( //
+ "windows-1251", new String[] { //
+ "cp1251", //
+ "windows-1251", //
+ "x-cp1251" });
+ createEncoding( //
+ "windows-1252", new String[] { //
+ "ansi_x3.4-1968", //
+ "ascii", //
+ "cp1252", //
+ "cp819", //
+ "csisolatin1", //
+ "ibm819", //
+ "iso-8859-1", //
+ "iso-ir-100", //
+ "iso8859-1", //
+ "iso88591", //
+ "iso_8859-1", //
+ "iso_8859-1:1987", //
+ "l1", //
+ "latin1", //
+ "us-ascii", //
+ "windows-1252", //
+ "x-cp1252" });
+ createEncoding( //
+ "windows-1253", new String[] { //
+ "cp1253", //
+ "windows-1253", //
+ "x-cp1253" });
+ createEncoding( //
+ "windows-1254", new String[] { //
+ "cp1254", //
+ "csisolatin5", //
+ "iso-8859-9", //
+ "iso-ir-148", //
+ "iso8859-9", //
+ "iso88599", //
+ "iso_8859-9", //
+ "iso_8859-9:1989", //
+ "l5", //
+ "latin5", //
+ "windows-1254", //
+ "x-cp1254" });
+ createEncoding( //
+ "windows-1255", new String[] { //
+ "cp1255", //
+ "windows-1255", //
+ "x-cp1255" });
+ createEncoding( //
+ "windows-1256", new String[] { //
+ "cp1256", //
+ "windows-1256", //
+ "x-cp1256" });
+ createEncoding( //
+ "windows-1257", new String[] { //
+ "cp1257", //
+ "windows-1257", //
+ "x-cp1257" });
+ createEncoding( //
+ "windows-1258", new String[] { //
+ "cp1258", //
+ "windows-1258", //
+ "x-cp1258" });
+ createEncoding( //
+ // Unsupported in Java
+ "x-mac-cyrillic", new String[] { //
+ "x-mac-cyrillic", //
+ "x-mac-ukrainian" });
+ createEncoding( //
+ "GBK", new String[] { //
+ "chinese", //
+ "csgb2312", //
+ "csiso58gb231280", //
+ "gb2312", //
+ "gb_2312", //
+ "gb_2312-80", //
+ "gbk", //
+ "iso-ir-58", //
+ "x-gbk" });
+ createEncoding( //
+ "gb18030", new String[] { //
+ "gb18030" });
+ createEncoding( //
+ "Big5", new String[] { //
+ "big5", //
+ "big5-hkscs", //
+ "cn-big5", //
+ "csbig5", //
+ "x-x-big5" });
+ createEncoding( //
+ "EUC-JP", new String[] { //
+ "cseucpkdfmtjapanese", //
+ "euc-jp", //
+ "x-euc-jp" });
+ createEncoding( //
+ "ISO-2022-JP", new String[] { //
+ "csiso2022jp", //
+ "iso-2022-jp" });
+ createEncoding( //
+ "Shift_JIS", new String[] { //
+ "csshiftjis", //
+ "ms932", //
+ "ms_kanji", //
+ "shift-jis", //
+ "shift_jis", //
+ "sjis", //
+ "windows-31j", //
+ "x-sjis" });
+ createEncoding( //
+ "EUC-KR", new String[] { //
+ "cseuckr", //
+ "csksc56011987", //
+ "euc-kr", //
+ "iso-ir-149", //
+ "korean", //
+ "ks_c_5601-1987", //
+ "ks_c_5601-1989", //
+ "ksc5601", //
+ "ksc_5601", //
+ "windows-949" });
+ createEncoding( //
+ // Special case
+ "replacement", new String[] { //
+ "csiso2022kr", //
+ "hz-gb-2312", //
+ "iso-2022-cn", //
+ "iso-2022-cn-ext", //
+ "iso-2022-kr", //
+ "replacement" });
+ createEncoding( //
+ "UTF-16BE", new String[] { //
+ "unicodefffe", //
+ "utf-16be" });
+ createEncoding( //
+ "UTF-16LE", new String[] { //
+ "csunicode", //
+ "iso-10646-ucs-2", //
+ "ucs-2", //
+ "unicode", //
+ "unicodefeff", //
+ "utf-16", //
+ "utf-16le" });
+ createEncoding( //
+ // Special case
+ "x-user-defined", new String[] { //
+ "x-user-defined" });
+ }
private final String canonName;
private final Charset charset;
- private final boolean asciiSuperset;
-
- private final boolean obscure;
-
- private final boolean shouldNot;
-
- private final boolean likelyEbcdic;
-
- private Encoding actualHtmlEncoding = null;
-
static {
- byte[] testBuf = new byte[0x7F];
- for (int i = 0; i < 0x7F; i++) {
- if (isAsciiSupersetnessSensitive(i)) {
- testBuf[i] = (byte) i;
- } else {
- testBuf[i] = (byte) 0x20;
- }
- }
-
- Set encodings = new HashSet();
-
- SortedMap charsets = Charset.availableCharsets();
- for (Map.Entry entry : charsets.entrySet()) {
- Charset cs = entry.getValue();
- String name = toNameKey(cs.name());
- String canonName = toAsciiLowerCase(cs.name());
- if (!isBanned(stripDashAndUnderscore(name))) {
- name = name.intern();
- boolean asciiSuperset = asciiMapsToBasicLatin(testBuf, cs);
- Encoding enc = new Encoding(canonName.intern(), cs,
- asciiSuperset, isObscure(name),
- isShouldNot(stripDashAndUnderscore(name)),
- isLikelyEbcdic(name, asciiSuperset));
- encodings.add(enc);
- Set aliases = cs.aliases();
- for (String alias : aliases) {
- encodingByCookedName.put(toNameKey(alias).intern(), enc);
- }
- }
- }
- // Overwrite possible overlapping aliases with the real things--just in
- // case
- for (Encoding encoding : encodings) {
- encodingByCookedName.put(toNameKey(encoding.getCanonName()),
- encoding);
- }
UTF8 = forName("utf-8");
- UTF16 = forName("utf-16");
UTF16BE = forName("utf-16be");
UTF16LE = forName("utf-16le");
WINDOWS1252 = forName("windows-1252");
- try {
- forName("iso-8859-1").actualHtmlEncoding = forName("windows-1252");
- } catch (UnsupportedCharsetException e) {
- }
- try {
- forName("iso-8859-9").actualHtmlEncoding = forName("windows-1254");
- } catch (UnsupportedCharsetException e) {
- }
- try {
- forName("iso-8859-11").actualHtmlEncoding = forName("windows-874");
- } catch (UnsupportedCharsetException e) {
- }
- try {
- forName("x-iso-8859-11").actualHtmlEncoding = forName("windows-874");
- } catch (UnsupportedCharsetException e) {
- }
- try {
- forName("tis-620").actualHtmlEncoding = forName("windows-874");
- } catch (UnsupportedCharsetException e) {
- }
- try {
- forName("gb_2312-80").actualHtmlEncoding = forName("gbk");
- } catch (UnsupportedCharsetException e) {
- }
- try {
- forName("gb2312").actualHtmlEncoding = forName("gbk");
- } catch (UnsupportedCharsetException e) {
- }
- try {
- encodingByCookedName.put("x-x-big5", forName("big5"));
- } catch (UnsupportedCharsetException e) {
- }
- try {
- encodingByCookedName.put("euc-kr", forName("windows-949"));
- } catch (UnsupportedCharsetException e) {
- }
- try {
- encodingByCookedName.put("ks_c_5601-1987", forName("windows-949"));
- } catch (UnsupportedCharsetException e) {
- }
- }
-
- private static boolean isAsciiSupersetnessSensitive(int c) {
- return (c >= 0x09 && c <= 0x0D) || (c >= 0x20 && c <= 0x22)
- || (c >= 0x26 && c <= 0x27) || (c >= 0x2C && c <= 0x3F)
- || (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A);
- }
-
- private static boolean isObscure(String lowerCasePreferredIanaName) {
- return !(Arrays.binarySearch(NOT_OBSCURE, lowerCasePreferredIanaName) > -1);
- }
-
- private static boolean isBanned(String lowerCasePreferredIanaName) {
- if (lowerCasePreferredIanaName.startsWith("xibm")) {
- return true;
- }
- return (Arrays.binarySearch(BANNED, lowerCasePreferredIanaName) > -1);
- }
-
- private static boolean isShouldNot(String lowerCasePreferredIanaName) {
- return (Arrays.binarySearch(SHOULD_NOT, lowerCasePreferredIanaName) > -1);
- }
-
- /**
- * @param testBuf
- * @param cs
- */
- private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) {
- CharsetDecoder dec = cs.newDecoder();
- dec.onMalformedInput(CodingErrorAction.REPORT);
- dec.onUnmappableCharacter(CodingErrorAction.REPORT);
- Reader r = new InputStreamReader(new ByteArrayInputStream(testBuf), dec);
- try {
- for (int i = 0; i < 0x7F; i++) {
- if (isAsciiSupersetnessSensitive(i)) {
- if (r.read() != i) {
- return false;
- }
- } else {
- if (r.read() != 0x20) {
- return false;
- }
- }
- }
- } catch (IOException e) {
- return false;
- } catch (Exception e) {
- return false;
- } catch (CoderMalfunctionError e) {
- return false;
- }
-
- return true;
- }
-
- private static boolean isLikelyEbcdic(String canonName,
- boolean asciiSuperset) {
- if (!asciiSuperset) {
- return (canonName.startsWith("cp") || canonName.startsWith("ibm") || canonName.startsWith("xibm"));
- } else {
- return false;
- }
}
public static Encoding forName(String name) {
- Encoding rv = encodingByCookedName.get(toNameKey(name));
+ Encoding rv = encodingByLabel.get(toNameKey(name));
if (rv == null) {
throw new UnsupportedCharsetException(name);
} else {
@@ -486,61 +422,13 @@ public static String toNameKey(String str) {
return new String(buf, 0, j);
}
- public static String stripDashAndUnderscore(String str) {
- if (str == null) {
- return null;
- }
- char[] buf = new char[str.length()];
- for (int i = 0; i < str.length(); i++) {
- char c = str.charAt(i);
- if (c == '-' || c == '_') {
- buf[i] = c;
- }
- }
- return new String(buf);
- }
-
- public static String toAsciiLowerCase(String str) {
- if (str == null) {
- return null;
- }
- char[] buf = new char[str.length()];
- for (int i = 0; i < str.length(); i++) {
- char c = str.charAt(i);
- if (c >= 'A' && c <= 'Z') {
- c += 0x20;
- }
- buf[i] = c;
- }
- return new String(buf);
- }
-
/**
* @param canonName
* @param charset
- * @param asciiSuperset
- * @param obscure
- * @param shouldNot
- * @param likelyEbcdic
*/
- private Encoding(final String canonName, final Charset charset,
- final boolean asciiSuperset, final boolean obscure,
- final boolean shouldNot, final boolean likelyEbcdic) {
+ private Encoding(final String canonName, final Charset charset) {
this.canonName = canonName;
this.charset = charset;
- this.asciiSuperset = asciiSuperset;
- this.obscure = obscure;
- this.shouldNot = shouldNot;
- this.likelyEbcdic = likelyEbcdic;
- }
-
- /**
- * Returns the asciiSuperset.
- *
- * @return the asciiSuperset
- */
- public boolean isAsciiSuperset() {
- return asciiSuperset;
}
/**
@@ -552,37 +440,6 @@ public String getCanonName() {
return canonName;
}
- /**
- * Returns the likelyEbcdic.
- *
- * @return the likelyEbcdic
- */
- public boolean isLikelyEbcdic() {
- return likelyEbcdic;
- }
-
- /**
- * Returns the obscure.
- *
- * @return the obscure
- */
- public boolean isObscure() {
- return obscure;
- }
-
- /**
- * Returns the shouldNot.
- *
- * @return the shouldNot
- */
- public boolean isShouldNot() {
- return shouldNot;
- }
-
- public boolean isRegistered() {
- return !canonName.startsWith("x-");
- }
-
/**
* @return
* @see java.nio.charset.Charset#canEncode()
@@ -607,24 +464,36 @@ public CharsetEncoder newEncoder() {
return charset.newEncoder();
}
- /**
- * Returns the actualHtmlEncoding.
- *
- * @return the actualHtmlEncoding
- */
- public Encoding getActualHtmlEncoding() {
- return actualHtmlEncoding;
+ protected static String msgLegacyEncoding(String name) {
+ // Message reporting that the non-UTF-8 encoding called name is in use.
+ return String.format("Legacy encoding \u201C%s\u201D used. Documents must use UTF-8.", name);
+ }
+
+ protected static String msgIgnoredCharset(String ignored, String name) {
+ // Message: the declared charset (ignored) is replaced by name.
+ return String.format("Internal encoding declaration specified \u201C%s\u201D."
+ + " Continuing as if the encoding had been \u201C%s\u201D.", ignored, name);
+ }
+ protected static String msgNotCanonicalName(String label, String name) {
+ // Message: label was used although the canonical name is name.
+ return String.format("The encoding \u201C%s\u201D is not the canonical name of the"
+ + " character encoding in use. The canonical name is \u201C%s\u201D. (Charmod C024)", label, name);
+ }
+
+ protected static String msgBadInternalCharset(String internalCharset) {
+ final String prefix = "Internal encoding declaration named an unsupported character";
+ return prefix + " encoding \u201C" + internalCharset + "\u201D.";
+ }
+
+ protected static String msgBadEncoding(String name) {
+ return String.format("Unsupported character encoding name: \u201C%s\u201D.", name);
+ }
public static void main(String[] args) {
- for (Map.Entry entry : encodingByCookedName.entrySet()) {
+ for (Map.Entry entry : encodingByLabel.entrySet()) {
String name = entry.getKey();
Encoding enc = entry.getValue();
- System.out.printf(
- "%21s: canon %21s, obs %5s, reg %5s, asc %5s, ebc %5s\n",
- name, enc.getCanonName(), enc.isObscure(),
- enc.isRegistered(), enc.isAsciiSuperset(),
- enc.isLikelyEbcdic());
+ System.out.printf("%21s: canon %13s\n", name, enc.getCanonName());
}
}
diff --git a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java
index 4facce4a..c54e591a 100755
--- a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java
+++ b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java
@@ -139,9 +139,7 @@ public HtmlInputStreamReader(InputStream inputStream,
if (encoding == null) {
declared = false;
} else if (encoding != Encoding.UTF8) {
- err("Legacy encoding \u201C"
- + encoding.getCanonName()
- + "\u201D used. Documents must use UTF-8.");
+ err(Encoding.msgLegacyEncoding(encoding.getCanonName()));
}
if (encoding == null
&& (heuristics == Heuristics.CHARDET || heuristics == Heuristics.ALL)) {
@@ -157,7 +155,8 @@ public HtmlInputStreamReader(InputStream inputStream,
encoding = Encoding.WINDOWS1252;
}
if (!declared) {
- err("The character encoding was not declared. Proceeding using \u201C" + encoding.getCanonName() + "\u201D.");
+ err("The character encoding was not declared. Proceeding using"
+ + " \u201C" + encoding.getCanonName() + "\u201D.");
}
if (driver != null) {
driver.setEncoding(encoding, Confidence.TENTATIVE);
@@ -168,11 +167,10 @@ public HtmlInputStreamReader(InputStream inputStream,
driver.setEncoding(Encoding.UTF8, Confidence.CERTAIN);
}
} else {
- err("Legacy encoding \u201C"
- + encoding.getCanonName()
- + "\u201D used. Documents must use UTF-8.");
+ err(Encoding.msgLegacyEncoding(encoding.getCanonName()));
if (driver != null) {
- driver.setEncoding(Encoding.UTF16, Confidence.CERTAIN);
+ // XXX Why did the old code call driver.setEncoding(Encoding.UTF16, ...) here?
+ driver.setEncoding(encoding, Confidence.CERTAIN);
}
}
}
diff --git a/src/nu/validator/htmlparser/io/MetaSniffer.java b/src/nu/validator/htmlparser/io/MetaSniffer.java
index 9deaef7a..60e157d0 100755
--- a/src/nu/validator/htmlparser/io/MetaSniffer.java
+++ b/src/nu/validator/htmlparser/io/MetaSniffer.java
@@ -159,55 +159,28 @@ public String getEncoding() {
}
protected boolean tryCharset(String encoding) throws SAXException {
- encoding = Encoding.toAsciiLowerCase(encoding);
+ encoding = encoding.toLowerCase(java.util.Locale.ROOT); // locale-independent; default-locale lowercasing breaks labels containing 'I' under tr/az locales
try {
- // XXX spec says only UTF-16
- if ("utf-16".equals(encoding) || "utf-16be".equals(encoding) || "utf-16le".equals(encoding) || "utf-32".equals(encoding) || "utf-32be".equals(encoding) || "utf-32le".equals(encoding)) {
+ if ("utf-16be".equals(encoding) || "utf-16le".equals(encoding)) {
this.characterEncoding = Encoding.UTF8;
- err("The internal character encoding declaration specified \u201C" + encoding + "\u201D which is not a rough superset of ASCII. Using \u201CUTF-8\u201D instead.");
+ err(Encoding.msgIgnoredCharset(encoding, "utf-8"));
+ return true;
+ } else if ("x-user-defined".equals(encoding)) {
+ this.characterEncoding = Encoding.WINDOWS1252;
+ err(Encoding.msgIgnoredCharset("x-user-defined", "windows-1252"));
return true;
} else {
Encoding cs = Encoding.forName(encoding);
String canonName = cs.getCanonName();
- if (!cs.isAsciiSuperset()) {
- err("The encoding \u201C"
- + encoding
- + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. Continuing the sniffing algorithm.");
- return false;
- }
- if (!cs.isRegistered()) {
- if (encoding.startsWith("x-")) {
- err("The encoding \u201C"
- + encoding
- + "\u201D is not an IANA-registered encoding. (Charmod C022)");
- } else {
- err("The encoding \u201C"
- + encoding
- + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)");
- }
- } else if (!cs.getCanonName().equals(encoding)) {
- err("The encoding \u201C" + encoding
- + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
- + canonName + "\u201D. (Charmod C024)");
- }
- if (cs.isShouldNot()) {
- warn("Authors should not use the character encoding \u201C"
- + encoding
- + "\u201D. It is recommended to use \u201CUTF-8\u201D.");
- } else if (cs.isObscure()) {
- warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
- }
- Encoding actual = cs.getActualHtmlEncoding();
- if (actual == null) {
this.characterEncoding = cs;
- } else {
- warn("Using \u201C" + actual.getCanonName() + "\u201D instead of the declared encoding \u201C" + encoding + "\u201D.");
- this.characterEncoding = actual;
+ if (!canonName.equals(encoding)) { // non-canonical label: report it, but the encoding assigned above is still used
+ err(Encoding.msgNotCanonicalName(encoding, canonName));
}
return true;
}
} catch (UnsupportedCharsetException e) {
- err("Unsupported character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing.");
+ err(Encoding.msgBadInternalCharset(encoding)
+ + " Will continue sniffing.");
}
return false;
}