Skip to content

Commit eab3602

Browse files
Drop/replace “is ASCII superset”-checking code
This change drops or replaces all code that checks whether a particular encoding is an ASCII superset, because that check no longer corresponds to any actual requirement in the Encoding spec; the spec now instead requires checking only whether an encoding is utf-16be or utf-16le.
1 parent 208fd51 commit eab3602

File tree

5 files changed

+5
-72
lines changed

5 files changed

+5
-72
lines changed

src/nu/validator/htmlparser/extra/ChardetSniffer.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,9 @@ public Encoding sniff() throws IOException {
5454
detector.Init(this);
5555
detector.DoIt(source, length, false);
5656
detector.DataEnd();
57-
if (returnValue != null && returnValue != Encoding.WINDOWS1252 && returnValue.isAsciiSuperset()) {
57+
if (returnValue != null && returnValue != Encoding.WINDOWS1252
58+
&& returnValue != Encoding.UTF16BE
59+
&& returnValue != Encoding.UTF16LE) {
5860
return returnValue;
5961
} else {
6062
return null;

src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,8 @@ public Encoding sniff() throws IOException {
5757
if (actual != null) {
5858
enc = actual;
5959
}
60-
if (enc != Encoding.WINDOWS1252 && enc.isAsciiSuperset()) {
60+
if (enc != Encoding.WINDOWS1252 //
61+
&& enc != Encoding.UTF16BE && enc != Encoding.UTF16LE) {
6162
return enc;
6263
} else {
6364
return null;

src/nu/validator/htmlparser/io/Driver.java

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -370,12 +370,6 @@ public boolean internalEncodingDeclaration(String internalCharset)
370370
if (actual == null) {
371371
actual = cs;
372372
}
373-
if (!actual.isAsciiSuperset()) {
374-
tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
375-
+ internalCharset
376-
+ "\u201D which is not an ASCII superset. Not changing the encoding.");
377-
return false;
378-
}
379373
if (characterEncoding == null) {
380374
// Reader case
381375
return true;

src/nu/validator/htmlparser/io/Encoding.java

Lines changed: 0 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -304,8 +304,6 @@ public class Encoding {
304304

305305
private final Charset charset;
306306

307-
private final boolean asciiSuperset;
308-
309307
private final boolean obscure;
310308

311309
private final boolean shouldNot;
@@ -315,15 +313,6 @@ public class Encoding {
315313
private Encoding actualHtmlEncoding = null;
316314

317315
static {
318-
byte[] testBuf = new byte[0x7F];
319-
for (int i = 0; i < 0x7F; i++) {
320-
if (isAsciiSupersetnessSensitive(i)) {
321-
testBuf[i] = (byte) i;
322-
} else {
323-
testBuf[i] = (byte) 0x20;
324-
}
325-
}
326-
327316
Set<Encoding> encodings = new HashSet<Encoding>();
328317

329318
SortedMap<String, Charset> charsets = Charset.availableCharsets();
@@ -398,12 +387,6 @@ asciiSuperset, isObscure(name),
398387
}
399388
}
400389

401-
private static boolean isAsciiSupersetnessSensitive(int c) {
402-
return (c >= 0x09 && c <= 0x0D) || (c >= 0x20 && c <= 0x22)
403-
|| (c >= 0x26 && c <= 0x27) || (c >= 0x2C && c <= 0x3F)
404-
|| (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A);
405-
}
406-
407390
private static boolean isObscure(String lowerCasePreferredIanaName) {
408391
return !(Arrays.binarySearch(NOT_OBSCURE, lowerCasePreferredIanaName) > -1);
409392
}
@@ -419,38 +402,6 @@ private static boolean isShouldNot(String lowerCasePreferredIanaName) {
419402
return (Arrays.binarySearch(SHOULD_NOT, lowerCasePreferredIanaName) > -1);
420403
}
421404

422-
/**
423-
* @param testBuf
424-
* @param cs
425-
*/
426-
private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) {
427-
CharsetDecoder dec = cs.newDecoder();
428-
dec.onMalformedInput(CodingErrorAction.REPORT);
429-
dec.onUnmappableCharacter(CodingErrorAction.REPORT);
430-
Reader r = new InputStreamReader(new ByteArrayInputStream(testBuf), dec);
431-
try {
432-
for (int i = 0; i < 0x7F; i++) {
433-
if (isAsciiSupersetnessSensitive(i)) {
434-
if (r.read() != i) {
435-
return false;
436-
}
437-
} else {
438-
if (r.read() != 0x20) {
439-
return false;
440-
}
441-
}
442-
}
443-
} catch (IOException e) {
444-
return false;
445-
} catch (Exception e) {
446-
return false;
447-
} catch (CoderMalfunctionError e) {
448-
return false;
449-
}
450-
451-
return true;
452-
}
453-
454405
private static boolean isLikelyEbcdic(String canonName,
455406
boolean asciiSuperset) {
456407
if (!asciiSuperset) {
@@ -536,15 +487,6 @@ private Encoding(final String canonName, final Charset charset,
536487
this.likelyEbcdic = likelyEbcdic;
537488
}
538489

539-
/**
540-
* Returns the asciiSuperset.
541-
*
542-
* @return the asciiSuperset
543-
*/
544-
public boolean isAsciiSuperset() {
545-
return asciiSuperset;
546-
}
547-
548490
/**
549491
* Returns the canonName.
550492
*

src/nu/validator/htmlparser/io/MetaSniffer.java

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -169,12 +169,6 @@ protected boolean tryCharset(String encoding) throws SAXException {
169169
} else {
170170
Encoding cs = Encoding.forName(encoding);
171171
String canonName = cs.getCanonName();
172-
if (!cs.isAsciiSuperset()) {
173-
err("The encoding \u201C"
174-
+ encoding
175-
+ "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. Continuing the sniffing algorithm.");
176-
return false;
177-
}
178172
if (!cs.isRegistered()) {
179173
if (encoding.startsWith("x-")) {
180174
err("The encoding \u201C"

0 commit comments

Comments
 (0)