Skip to content

Commit 1a4c853

Browse files
committed
Prevent the common zero-width code points and detect invalid UTF-8 encoding in our sources and selected resource files (#12937)
* Simple patch to prevent the common zero-width code points in our source and some types of resource files
* Validate correct UTF-8 input and fix buggy CSS file (ISO-8859-x encoded)
* add a bit of context
* Add CHANGES.txt
1 parent 5643712 commit 1a4c853

File tree

8 files changed

+43
-18
lines changed

8 files changed

+43
-18
lines changed

gradle/validation/validate-source-patterns.gradle

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
* limitations under the License.
1616
*/
1717

18+
import java.nio.charset.CharacterCodingException;
19+
import java.nio.charset.CodingErrorAction;
20+
import java.nio.charset.StandardCharsets;
21+
1822
import org.apache.rat.Defaults
1923
import org.apache.rat.document.impl.FileDocument
2024
import org.apache.rat.api.MetaData
@@ -144,8 +148,8 @@ class ValidateSourcePatternsTask extends DefaultTask {
144148
(~$/\$$Id\b/$) : 'svn keyword',
145149
(~$/\$$Header\b/$) : 'svn keyword',
146150
(~$/\$$Source\b/$) : 'svn keyword',
147-
(~$/^\uFEFF/$) : 'UTF-8 byte order mark',
148-
(~$/import java\.lang\.\w+;/$) : 'java.lang import is unnecessary'
151+
(~$/[\u200B\uFEFF]/$) : 'UTF-8 byte order mark or other zero-width codepoints',
152+
(~$/import java\.lang\.\w+;/$) : 'java.lang import is unnecessary',
149153
]
150154

151155
// Python and others merrily use var declarations, this is a problem _only_ in Java at least for 8x where we're forbidding var declarations
@@ -198,15 +202,29 @@ class ValidateSourcePatternsTask extends DefaultTask {
198202
ProgressLogger progress = progressLoggerFactory.newOperation(this.class)
199203
progress.start(this.name, this.name)
200204

205+
def validatingDecoder = StandardCharsets.UTF_8.newDecoder()
206+
.onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT)
207+
201208
sourceFiles.each { f ->
202209
try {
203210
progress.progress("Scanning ${f.name}")
204211
logger.debug('Scanning source file: {}', f);
205212

206-
def text = f.getText('UTF-8');
213+
String text
214+
try {
215+
validatingDecoder.reset()
216+
text = f.withInputStream {
217+
in -> new InputStreamReader(in, validatingDecoder).getText()
218+
}
219+
} catch (CharacterCodingException e) {
220+
reportViolation(f, "incorrect UTF-8 encoding [${e}]")
221+
return // we can't proceed for this file
222+
}
223+
207224
invalidPatterns.each { pattern, name ->
208-
if (pattern.matcher(text).find()) {
209-
reportViolation(f, name);
225+
def matcher = pattern.matcher(text);
226+
if (matcher.find()) {
227+
reportViolation(f, String.format(Locale.ROOT, '%s [start=%d, end=%d]', name, matcher.start(), matcher.end()));
210228
}
211229
}
212230
def javadocsMatcher = javadocsPattern.matcher(text);
@@ -230,9 +248,10 @@ class ValidateSourcePatternsTask extends DefaultTask {
230248
}
231249
checkLicenseHeaderPrecedes(f, 'package', packagePattern, javaCommentPattern, text, ratDocument);
232250

233-
invalidJavaOnlyPatterns.each { pattern,name ->
234-
if (pattern.matcher(text).find()) {
235-
reportViolation(f, name);
251+
invalidJavaOnlyPatterns.each { pattern, name ->
252+
def matcher = pattern.matcher(text);
253+
if (matcher.find()) {
254+
reportViolation(f, String.format(Locale.ROOT, '%s [start=%d, end=%d]', name, matcher.start(), matcher.end()));
236255
}
237256
}
238257
}

lucene/CHANGES.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,12 @@ Bug Fixes
4545
* GITHUB#12920: Address bug in TestDrillSideways#testCollectionTerminated that could occasionally cause the test to
4646
fail with certain random seeds. (Greg Miller)
4747

48+
Build
49+
---------------------
50+
51+
* GITHUB#12931, GITHUB#12936, GITHUB#12937: Improve source file validation to detect incorrect
52+
UTF-8 sequences and forbid U+200B; enable errorprone DisableUnicodeInCode check. (Robert Muir, Uwe Schindler)
53+
4854
Other
4955
---------------------
5056
* GITHUB#11023: Removing some dead code in CheckIndex. (Jakub Slowinski)

lucene/analysis/common/src/generated/checksums/generateUAX29URLEmailTokenizer.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
33
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "346eb94735de07f845269e8f042ed0a25f09bc7f",
44
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "971a80479af08c54fd2fab9a75bb2321904cb3ef",
5-
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "00a4d8cccb5e8d05aba986d48ef1acedd88a15cf"
5+
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "8d7cd1a935443deda6cad73b91f1a45c1c714535"
66
}

lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
334334
//
335335
// In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
336336
//
337-
// WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
337+
// WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
338338
//
339339
{EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} )
340340
| {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
22
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
33
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java": "de1fac09ea1550250db3a044b757617f8d62cffe",
4-
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "36abbde28c6b47e1f8e90351e839ec8b2da73b99"
4+
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "2db77654bcef8add1523d9e6260a3f72a3c58ed1"
55
}

lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ ComplexContextEx = \p{LB:Complex_Context}
200200
//
201201
// In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
202202
//
203-
// WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
203+
// WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
204204
//
205205
{EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} )
206206
| {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx}

lucene/queryparser/docs/xml/cctree.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
/* This code is based on the one originally provided by
2-
Geir Landrö in his dTree 2.05 package. You can get it
2+
Geir Landrö in his dTree 2.05 package. You can get it
33
at : www.destroydrop.com/javascript/tree/.
44
55
Therefore, the DTDDoc team considers that this code is
6-
Copyright (c) 2002-2003 Geir Landrö. Since the original
6+
Copyright (c) 2002-2003 Geir Landrö. Since the original
77
author didn't clearly forbids copies of this part, we
88
assume we're not doing anything wrong in porviding it
99
to you, in a modified or non-modified form.
1010
*/
1111

1212
/*
13-
Geir Landrö : Orignal version, for dTree.
13+
Geir Landrö : Orignal version, for dTree.
1414
1515
Michael Koehrsen (10/2004) : Original modification to
1616
allow DTDDoc to use this.

lucene/queryparser/docs/xml/dtreeStyle.css

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
/* This CSS is based on the one originally provided by
2-
Geir Landrö in his dTree 2.05 package. You can get it
2+
Geir Landrö in his dTree 2.05 package. You can get it
33
at : www.destroydrop.com/javascript/tree/.
44
55
Therefore, the DTDDoc team considers that this code is
6-
Copyright (c) 2002-2003 Geir Landrö. Since the original
6+
Copyright (c) 2002-2003 Geir Landrö. Since the original
77
author didn't clearly forbids copies of this part, we
88
assume we're not doing anything wrong in porviding it
99
to you, in a modified or non-modified form.
1010
*/
1111

1212
/*
13-
Geir Landrö : Orignal version, for dTree.
13+
Geir Landrö : Orignal version, for dTree.
1414
1515
Stefan Champailler (10/2004) : Style changes here and
1616
there.

0 commit comments

Comments
 (0)