bugfix: char class casefold for certain chars

haozhun · haozhun · commit 13fe106cd027 · 2015-03-20T12:01:05.000-07:00
When a character's code point fits in a single byte (<= 0xff) but
its encoded form takes more than one byte in the current encoding,
the case-folding code incorrectly puts it in the bitset instead of
the code range. As a result, with UTF-8 encoding, case folding works
incorrectly for characters in the range \u0080 to \u00ff (Latin-1
Supplement).

Before the fix (input string, pattern, case-insensitive match result):

* `"\u00c2"` `[\u00e0-\u00e5]` returns false
* `"\u00c2"` `[\u00e2]` returns false
* `"\u00c2"` `\u00e2` returns true
diff --git a/src/org/joni/ApplyCaseFold.java b/src/org/joni/ApplyCaseFold.java
@@ -41,7 +41,7 @@ public void apply(int from, int[]to, int length, Object o) {
 
             if (Config.CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS) {
                 if ((inCC && !cc.isNot()) || (!inCC && cc.isNot())) {
-                    if (enc.minLength() > 1 || to[0] >= BitSet.SINGLE_BYTE_SIZE) {
+                    if (enc.minLength() > 1 || to[0] >= BitSet.SINGLE_BYTE_SIZE || enc.codeToMbcLength(to[0]) > 1) {
                         cc.addCodeRange(env, to[0], to[0]);
                     } else {
                         /* /(?i:[^A-C])/.match("a") ==> fail. */