Skip to content

Commit e6a332b

Browse files
macchiati and markusicu authored
Fix comma in link email property (#1269)
* Add changes for email property
* Fixed data files
* Update code for prop change, regen dev file.
* Update unicodetools/src/main/java/org/unicode/utilities/LinkUtilities.java

Co-authored-by: Markus Scherer <[email protected]>

---------

Co-authored-by: Markus Scherer <[email protected]>
1 parent 6df8f96 commit e6a332b

File tree

7 files changed

+21
-16
lines changed

7 files changed

+21
-16
lines changed

unicodetools/data/linkification/17.0.0/LinkBracket.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkBracket.txt
2-
# Date: 2025-12-20, 21:02:29 GMT
2+
# Date: 2025-12-26, 00:24:58 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html

unicodetools/data/linkification/17.0.0/LinkDetectionTest.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkDetectionTest.txt
2-
# Date: 2025-12-20, 21:02:29 GMT
2+
# Date: 2025-12-26, 00:24:58 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html

unicodetools/data/linkification/17.0.0/LinkEmail.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkEmail.txt
2-
# Date: 2025-12-20, 21:02:29 GMT
2+
# Date: 2025-12-26, 00:24:58 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
@@ -26,7 +26,8 @@
2626
#
2727
0021 # 1.1 (!) EXCLAMATION MARK
2828
0023..0027 # 1.1 [5] (#..') NUMBER SIGN..APOSTROPHE
29-
002A..0039 # 1.1 [16] (*..9) ASTERISK..DIGIT NINE
29+
002A..002B # 1.1 [2] (*..+) ASTERISK..PLUS SIGN
30+
002D..0039 # 1.1 [13] (-..9) HYPHEN-MINUS..DIGIT NINE
3031
003D # 1.1 (=) EQUALS SIGN
3132
003F # 1.1 (?) QUESTION MARK
3233
0041..005A # 1.1 [26] (A..Z) LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
@@ -1292,4 +1293,4 @@ FFDA..FFDC # 1.1 [3] (ᅳ..ᅵ) HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
12921293
323B0..33479 # 17.0 [4298] (𲎰..𳑹) CJK UNIFIED IDEOGRAPH-323B0..CJK UNIFIED IDEOGRAPH-33479
12931294
E0100..E01EF # 4.0 [240] (U+E0100..U+E01EF) VARIATION SELECTOR-17..VARIATION SELECTOR-256
12941295

1295-
# Total code points: 149241
1296+
# Total code points: 149240

unicodetools/data/linkification/17.0.0/LinkFormattingTest.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkFormattingTest.txt
2-
# Date: 2025-12-20, 21:02:29 GMT
2+
# Date: 2025-12-26, 00:24:58 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html

unicodetools/data/linkification/17.0.0/LinkTerm.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkTerm.txt
2-
# Date: 2025-12-20, 21:02:29 GMT
2+
# Date: 2025-12-26, 00:24:58 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html

unicodetools/data/linkification/dev/LinkEmail.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkEmail.txt
2-
# Date: 2025-12-24, 02:37:15 GMT
2+
# Date: 2025-12-26, 00:32:54 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
@@ -26,7 +26,8 @@
2626
#
2727
0021 # 1.1 (!) EXCLAMATION MARK
2828
0023..0027 # 1.1 [5] (#..') NUMBER SIGN..APOSTROPHE
29-
002A..0039 # 1.1 [16] (*..9) ASTERISK..DIGIT NINE
29+
002A..002B # 1.1 [2] (*..+) ASTERISK..PLUS SIGN
30+
002D..0039 # 1.1 [13] (-..9) HYPHEN-MINUS..DIGIT NINE
3031
003D # 1.1 (=) EQUALS SIGN
3132
003F # 1.1 (?) QUESTION MARK
3233
0041..005A # 1.1 [26] (A..Z) LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
@@ -1331,4 +1332,4 @@ FFDA..FFDC # 1.1 [3] (ᅳ..ᅵ) HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
13311332
3D000..3FC3F # 18.0 [11328] (U+3D000..U+3FC3F) SEAL CHARACTER-3D000..SEAL CHARACTER-3FC3F
13321333
E0100..E01EF # 4.0 [240] (U+E0100..U+E01EF) VARIATION SELECTOR-17..VARIATION SELECTOR-256
13331334

1334-
# Total code points: 162119
1335+
# Total code points: 162118

unicodetools/src/main/java/org/unicode/utilities/LinkUtilities.java

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -194,17 +194,20 @@ private LinkTermination(String uset) {
194194
}
195195
}
196196

197-
// Note: the source standards are painful to read.
198-
// https://en.wikipedia.org/wiki/Email_address#Local-part is much easier
197+
// https://www.rfc-editor.org/rfc/rfc5322.html#section-3.2.3 has the full list for the ASCII part
198+
// See also https://en.wikipedia.org/wiki/Email_address#Local-part
199+
// We add dot (ASCII '.') here, and check the special dot constraints afterwards.
199200

200-
static final UnicodeSet EMAIL_EXCLUDES =
201-
new UnicodeSet("[\\u0020 ; \\: \" ( ) \\[ \\] @ \\\\ < >]").freeze();
201+
static final UnicodeSet EMAIL_ASCII_INCLUDES =
202+
new UnicodeSet("[[a-zA-Z][0-9][_ \\- ! ? ' \\{ \\} * / \\& # % ` \\^ + = | ~ \\$]]")
203+
.add('.')
204+
.freeze();
202205
static final UnicodeSet validEmailLocalPart =
203206
new UnicodeSet(
204-
"[\\p{XID_Continue}\\p{block=basic_latin}-\\p{Cc}]",
207+
"[\\p{XID_Continue}-\\p{block=basic_latin}]",
205208
new ParsePosition(0),
206209
VersionedSymbolTable.frozenAt(UNICODE_VERSION))
207-
.removeAll(EMAIL_EXCLUDES)
210+
.addAll(EMAIL_ASCII_INCLUDES)
208211
.freeze();
209212
public static final UnicodeProperty LinkEmail =
210213
new UnicodeSetProperty()

0 commit comments

Comments
 (0)