Prevent the common zero-width code points and detect invalid UTF-8 encoding in our sources and selected resource files (#12937)

uschindler · uschindler · commit 1a4c85310c3e · 2023-12-13T17:28:21.000+01:00
* Simple patch to prevent the common zero-width code points in our source and some types of resource files

* Validate correct UTF-8 input and fix buggy JS and CSS files (ISO-8859-x encoded)

* Add a bit of context

* Add CHANGES.txt
diff --git a/gradle/validation/validate-source-patterns.gradle b/gradle/validation/validate-source-patterns.gradle
@@ -15,6 +15,10 @@
  * limitations under the License.
  */
 
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
+
 import org.apache.rat.Defaults
 import org.apache.rat.document.impl.FileDocument
 import org.apache.rat.api.MetaData
@@ -144,8 +148,8 @@ class ValidateSourcePatternsTask extends DefaultTask {
       (~$/\$$Id\b/$) : 'svn keyword',
       (~$/\$$Header\b/$) : 'svn keyword',
       (~$/\$$Source\b/$) : 'svn keyword',
-      (~$/^\uFEFF/$) : 'UTF-8 byte order mark',
-      (~$/import java\.lang\.\w+;/$) : 'java.lang import is unnecessary'
+      (~$/[\u200B\uFEFF]/$) : 'UTF-8 byte order mark or other zero-width codepoints',
+      (~$/import java\.lang\.\w+;/$) : 'java.lang import is unnecessary',
     ]
 
     // Python and others merrily use var declarations, this is a problem _only_ in Java at least for 8x where we're forbidding var declarations
@@ -198,15 +202,29 @@ class ValidateSourcePatternsTask extends DefaultTask {
     ProgressLogger progress = progressLoggerFactory.newOperation(this.class)
     progress.start(this.name, this.name)
 
+    def validatingDecoder = StandardCharsets.UTF_8.newDecoder()
+      .onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT)
+
     sourceFiles.each { f ->
       try {
         progress.progress("Scanning ${f.name}")
         logger.debug('Scanning source file: {}', f);
 
-        def text = f.getText('UTF-8');
+        String text
+        try {
+          validatingDecoder.reset()
+          text = f.withInputStream {
+            in -> new InputStreamReader(in, validatingDecoder).getText()
+          }
+        } catch (CharacterCodingException e) {
+          reportViolation(f, "incorrect UTF-8 encoding [${e}]")
+          return // we can't proceed for this file
+        }
+
         invalidPatterns.each { pattern, name ->
-          if (pattern.matcher(text).find()) {
-            reportViolation(f, name);
+          def matcher = pattern.matcher(text);
+          if (matcher.find()) {
+            reportViolation(f, String.format(Locale.ROOT, '%s [start=%d, end=%d]', name, matcher.start(), matcher.end()));
           }
         }
         def javadocsMatcher = javadocsPattern.matcher(text);
@@ -230,9 +248,10 @@ class ValidateSourcePatternsTask extends DefaultTask {
           }
           checkLicenseHeaderPrecedes(f, 'package', packagePattern, javaCommentPattern, text, ratDocument);
 
-          invalidJavaOnlyPatterns.each { pattern,name ->
-            if (pattern.matcher(text).find()) {
-              reportViolation(f, name);
+          invalidJavaOnlyPatterns.each { pattern, name ->
+            def matcher = pattern.matcher(text);
+            if (matcher.find()) {
+              reportViolation(f, String.format(Locale.ROOT, '%s [start=%d, end=%d]', name, matcher.start(), matcher.end()));
             }
           }
         }
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -45,6 +45,12 @@ Bug Fixes
 * GITHUB#12920: Address bug in TestDrillSideways#testCollectionTerminated that could occasionally cause the test to
   fail with certain random seeds. (Greg Miller)
 
+Build
+---------------------
+
+* GITHUB#12931, GITHUB#12936, GITHUB#12937: Improve source file validation to detect incorrect
+  UTF-8 sequences and forbid U+200B; enable errorprone DisableUnicodeInCode check.  (Robert Muir, Uwe Schindler)
+
 Other
 ---------------------
 * GITHUB#11023: Removing some dead code in CheckIndex. (Jakub Slowinski)
diff --git a/lucene/analysis/common/src/generated/checksums/generateUAX29URLEmailTokenizer.json b/lucene/analysis/common/src/generated/checksums/generateUAX29URLEmailTokenizer.json
@@ -2,5 +2,5 @@
     "gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
     "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "346eb94735de07f845269e8f042ed0a25f09bc7f",
     "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "971a80479af08c54fd2fab9a75bb2321904cb3ef",
-    "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "00a4d8cccb5e8d05aba986d48ef1acedd88a15cf"
+    "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "8d7cd1a935443deda6cad73b91f1a45c1c714535"
 }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex b/lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex
@@ -334,7 +334,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
   //
   //     In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
   //
-  //         WB3c′ ZWJ × ​(Extended_Pictographic | EmojiNRK)
+  //         WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
   //
     {EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} ) 
   | {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx} 
diff --git a/lucene/core/src/generated/checksums/generateStandardTokenizer.json b/lucene/core/src/generated/checksums/generateStandardTokenizer.json
@@ -1,5 +1,5 @@
 {
     "gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
     "lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java": "de1fac09ea1550250db3a044b757617f8d62cffe",
-    "lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "36abbde28c6b47e1f8e90351e839ec8b2da73b99"
+    "lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "2db77654bcef8add1523d9e6260a3f72a3c58ed1"
 }
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
@@ -200,7 +200,7 @@ ComplexContextEx    = \p{LB:Complex_Context}
 //
 //     In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
 //
-//         WB3c′ ZWJ × ​(Extended_Pictographic | EmojiNRK)
+//         WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
 //
   {EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} ) 
 | {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx} 
diff --git a/lucene/queryparser/docs/xml/cctree.js b/lucene/queryparser/docs/xml/cctree.js
@@ -1,16 +1,16 @@
 /* This code is based on the one originally provided by
-   Geir Landr� in his dTree 2.05 package. You can get it
+   Geir Landrö in his dTree 2.05 package. You can get it
    at : www.destroydrop.com/javascript/tree/.
    
    Therefore, the DTDDoc team considers that this code is 
-   Copyright (c) 2002-2003 Geir Landr�. Since the original
+   Copyright (c) 2002-2003 Geir Landrö. Since the original
    author didn't clearly forbids copies of this part, we
    assume we're not doing anything wrong in porviding it
    to you, in a modified or non-modified form.
 */
 
 /*   
-   Geir Landr� : Orignal version, for dTree.
+   Geir Landrö : Orignal version, for dTree.
    
    Michael Koehrsen (10/2004) : Original modification to
       allow DTDDoc to use this.
diff --git a/lucene/queryparser/docs/xml/dtreeStyle.css b/lucene/queryparser/docs/xml/dtreeStyle.css
@@ -1,16 +1,16 @@
 /* This CSS is based on the one originally provided by
-   Geir Landr� in his dTree 2.05 package. You can get it
+   Geir Landrö in his dTree 2.05 package. You can get it
    at : www.destroydrop.com/javascript/tree/.
    
    Therefore, the DTDDoc team considers that this code is 
-   Copyright (c) 2002-2003 Geir Landr�. Since the original
+   Copyright (c) 2002-2003 Geir Landrö. Since the original
    author didn't clearly forbids copies of this part, we
    assume we're not doing anything wrong in porviding it
    to you, in a modified or non-modified form.
 */
 
 /*   
-   Geir Landr� : Orignal version, for dTree.
+   Geir Landrö : Orignal version, for dTree.
    
    Stefan Champailler (10/2004) : Style changes here and
       there.

Original file line number	Diff line number	Diff line change
`@@ -2,5 +2,5 @@`
`2`	`2`	`"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",`
`3`	`3`	`"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "346eb94735de07f845269e8f042ed0a25f09bc7f",`
`4`	`4`	`"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "971a80479af08c54fd2fab9a75bb2321904cb3ef",`
`5`		`- "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "00a4d8cccb5e8d05aba986d48ef1acedd88a15cf"`
	`5`	`+ "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "8d7cd1a935443deda6cad73b91f1a45c1c714535"`
`6`	`6`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`{`
`2`	`2`	`"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",`
`3`	`3`	`"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java": "de1fac09ea1550250db3a044b757617f8d62cffe",`
`4`		`- "lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "36abbde28c6b47e1f8e90351e839ec8b2da73b99"`
	`4`	`+ "lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "2db77654bcef8add1523d9e6260a3f72a3c58ed1"`
`5`	`5`	`}`