Skip to content

Commit 03954fb

Browse files
Drop/replace “is ASCII superset”-checking code
This change drops or replaces all code for checking whether a particular encoding is an ASCII superset — because the code no longer corresponds to actual requirements in the Encoding spec (which instead now requires checking only whether an encoding is utf-16be or utf-16le).
1 parent: 3b7c04e · commit: 03954fb

File tree

5 files changed

+5
-72
lines changed

5 files changed

+5
-72
lines changed

src/nu/validator/htmlparser/extra/ChardetSniffer.java

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,9 @@ public Encoding sniff() throws IOException {
5454
detector.Init(this);
5555
detector.DoIt(source, length, false);
5656
detector.DataEnd();
57-
if (returnValue != null && returnValue != Encoding.WINDOWS1252 && returnValue.isAsciiSuperset()) {
57+
if (returnValue != null && returnValue != Encoding.WINDOWS1252
58+
&& returnValue != Encoding.UTF16BE
59+
&& returnValue != Encoding.UTF16LE) {
5860
return returnValue;
5961
} else {
6062
return null;

src/nu/validator/htmlparser/extra/IcuDetectorSniffer.java

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,8 @@ public Encoding sniff() throws IOException {
5757
if (actual != null) {
5858
enc = actual;
5959
}
60-
if (enc != Encoding.WINDOWS1252 && enc.isAsciiSuperset()) {
60+
if (enc != Encoding.WINDOWS1252 //
61+
&& enc != Encoding.UTF16BE && enc != Encoding.UTF16LE) {
6162
return enc;
6263
} else {
6364
return null;

src/nu/validator/htmlparser/io/Driver.java

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -353,12 +353,6 @@ public boolean internalEncodingDeclaration(String internalCharset)
353353
if (actual == null) {
354354
actual = cs;
355355
}
356-
if (!actual.isAsciiSuperset()) {
357-
tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C"
358-
+ internalCharset
359-
+ "\u201D which is not an ASCII superset. Not changing the encoding.");
360-
return false;
361-
}
362356
if (characterEncoding == null) {
363357
// Reader case
364358
return true;

src/nu/validator/htmlparser/io/Encoding.java

Lines changed: 0 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -79,8 +79,6 @@ public class Encoding {
7979

8080
private final Charset charset;
8181

82-
private final boolean asciiSuperset;
83-
8482
private final boolean obscure;
8583

8684
private final boolean shouldNot;
@@ -90,15 +88,6 @@ public class Encoding {
9088
private Encoding actualHtmlEncoding = null;
9189

9290
static {
93-
byte[] testBuf = new byte[0x7F];
94-
for (int i = 0; i < 0x7F; i++) {
95-
if (isAsciiSupersetnessSensitive(i)) {
96-
testBuf[i] = (byte) i;
97-
} else {
98-
testBuf[i] = (byte) 0x20;
99-
}
100-
}
101-
10291
Set<Encoding> encodings = new HashSet<Encoding>();
10392

10493
SortedMap<String, Charset> charsets = Charset.availableCharsets();
@@ -172,12 +161,6 @@ asciiSuperset, isObscure(name), isShouldNot(name),
172161
}
173162
}
174163

175-
private static boolean isAsciiSupersetnessSensitive(int c) {
176-
return (c >= 0x09 && c <= 0x0D) || (c >= 0x20 && c <= 0x22)
177-
|| (c >= 0x26 && c <= 0x27) || (c >= 0x2C && c <= 0x3F)
178-
|| (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A);
179-
}
180-
181164
private static boolean isObscure(String lowerCasePreferredIanaName) {
182165
return !(Arrays.binarySearch(NOT_OBSCURE, lowerCasePreferredIanaName) > -1);
183166
}
@@ -193,38 +176,6 @@ private static boolean isShouldNot(String lowerCasePreferredIanaName) {
193176
return (Arrays.binarySearch(SHOULD_NOT, lowerCasePreferredIanaName) > -1);
194177
}
195178

196-
/**
197-
* @param testBuf
198-
* @param cs
199-
*/
200-
private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) {
201-
CharsetDecoder dec = cs.newDecoder();
202-
dec.onMalformedInput(CodingErrorAction.REPORT);
203-
dec.onUnmappableCharacter(CodingErrorAction.REPORT);
204-
Reader r = new InputStreamReader(new ByteArrayInputStream(testBuf), dec);
205-
try {
206-
for (int i = 0; i < 0x7F; i++) {
207-
if (isAsciiSupersetnessSensitive(i)) {
208-
if (r.read() != i) {
209-
return false;
210-
}
211-
} else {
212-
if (r.read() != 0x20) {
213-
return false;
214-
}
215-
}
216-
}
217-
} catch (IOException e) {
218-
return false;
219-
} catch (Exception e) {
220-
return false;
221-
} catch (CoderMalfunctionError e) {
222-
return false;
223-
}
224-
225-
return true;
226-
}
227-
228179
private static boolean isLikelyEbcdic(String canonName,
229180
boolean asciiSuperset) {
230181
if (!asciiSuperset) {
@@ -298,15 +249,6 @@ private Encoding(final String canonName, final Charset charset,
298249
this.likelyEbcdic = likelyEbcdic;
299250
}
300251

301-
/**
302-
* Returns the asciiSuperset.
303-
*
304-
* @return the asciiSuperset
305-
*/
306-
public boolean isAsciiSuperset() {
307-
return asciiSuperset;
308-
}
309-
310252
/**
311253
* Returns the canonName.
312254
*

src/nu/validator/htmlparser/io/MetaSniffer.java

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -169,12 +169,6 @@ protected boolean tryCharset(String encoding) throws SAXException {
169169
} else {
170170
Encoding cs = Encoding.forName(encoding);
171171
String canonName = cs.getCanonName();
172-
if (!cs.isAsciiSuperset()) {
173-
err("The encoding \u201C"
174-
+ encoding
175-
+ "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. Continuing the sniffing algorithm.");
176-
return false;
177-
}
178172
if (!cs.isRegistered()) {
179173
if (encoding.startsWith("x-")) {
180174
err("The encoding \u201C"

0 commit comments

Comments
 (0)