File tree Expand file tree Collapse file tree 7 files changed +21
-16
lines changed
src/main/java/org/unicode/utilities Expand file tree Collapse file tree 7 files changed +21
-16
lines changed Original file line number Diff line number Diff line change 11# LinkBracket.txt
2- # Date: 2025-12-20, 21:02:29 GMT
2+ # Date: 2025-12-26, 00:24:58 GMT
33# © 2025 Unicode®, Inc.
44# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55# For terms of use and license, see https://www.unicode.org/terms_of_use.html
Original file line number Diff line number Diff line change 11# LinkDetectionTest.txt
2- # Date: 2025-12-20, 21:02:29 GMT
2+ # Date: 2025-12-26, 00:24:58 GMT
33# © 2025 Unicode®, Inc.
44# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55# For terms of use and license, see https://www.unicode.org/terms_of_use.html
Original file line number Diff line number Diff line change 11# LinkEmail.txt
2- # Date: 2025-12-20, 21:02:29 GMT
2+ # Date: 2025-12-26, 00:24:58 GMT
33# © 2025 Unicode®, Inc.
44# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55# For terms of use and license, see https://www.unicode.org/terms_of_use.html
2626#
27270021 # 1.1 (!) EXCLAMATION MARK
28280023..0027 # 1.1 [5] (#..') NUMBER SIGN..APOSTROPHE
29- 002A..0039 # 1.1 [16] (*..9) ASTERISK..DIGIT NINE
29+ 002A..002B # 1.1 [2] (*..+) ASTERISK..PLUS SIGN
30+ 002D..0039 # 1.1 [13] (-..9) HYPHEN-MINUS..DIGIT NINE
3031003D # 1.1 (=) EQUALS SIGN
3132003F # 1.1 (?) QUESTION MARK
32330041..005A # 1.1 [26] (A..Z) LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
@@ -1292,4 +1293,4 @@ FFDA..FFDC # 1.1 [3] (ᅳ..ᅵ) HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
12921293323B0..33479 # 17.0 [4298] (..) CJK UNIFIED IDEOGRAPH-323B0..CJK UNIFIED IDEOGRAPH-33479
12931294E0100..E01EF # 4.0 [240] (U+E0100..U+E01EF) VARIATION SELECTOR-17..VARIATION SELECTOR-256
12941295
1295- # Total code points: 149241
1296+ # Total code points: 149240
Original file line number Diff line number Diff line change 11# LinkFormattingTest.txt
2- # Date: 2025-12-20, 21:02:29 GMT
2+ # Date: 2025-12-26, 00:24:58 GMT
33# © 2025 Unicode®, Inc.
44# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55# For terms of use and license, see https://www.unicode.org/terms_of_use.html
Original file line number Diff line number Diff line change 11# LinkTerm.txt
2- # Date: 2025-12-20, 21:02:29 GMT
2+ # Date: 2025-12-26, 00:24:58 GMT
33# © 2025 Unicode®, Inc.
44# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55# For terms of use and license, see https://www.unicode.org/terms_of_use.html
Original file line number Diff line number Diff line change 11# LinkEmail.txt
2- # Date: 2025-12-24, 02:37:15 GMT
2+ # Date: 2025-12-26, 00:32:54 GMT
33# © 2025 Unicode®, Inc.
44# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55# For terms of use and license, see https://www.unicode.org/terms_of_use.html
2626#
27270021 # 1.1 (!) EXCLAMATION MARK
28280023..0027 # 1.1 [5] (#..') NUMBER SIGN..APOSTROPHE
29- 002A..0039 # 1.1 [16] (*..9) ASTERISK..DIGIT NINE
29+ 002A..002B # 1.1 [2] (*..+) ASTERISK..PLUS SIGN
30+ 002D..0039 # 1.1 [13] (-..9) HYPHEN-MINUS..DIGIT NINE
3031003D # 1.1 (=) EQUALS SIGN
3132003F # 1.1 (?) QUESTION MARK
32330041..005A # 1.1 [26] (A..Z) LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
@@ -1331,4 +1332,4 @@ FFDA..FFDC # 1.1 [3] (ᅳ..ᅵ) HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
133113323D000..3FC3F # 18.0 [11328] (U+3D000..U+3FC3F) SEAL CHARACTER-3D000..SEAL CHARACTER-3FC3F
13321333E0100..E01EF # 4.0 [240] (U+E0100..U+E01EF) VARIATION SELECTOR-17..VARIATION SELECTOR-256
13331334
1334- # Total code points: 162119
1335+ # Total code points: 162118
Original file line number Diff line number Diff line change @@ -194,17 +194,20 @@ private LinkTermination(String uset) {
194194 }
195195 }
196196
197- // Note: the source standards are painful to read.
198- // https://en.wikipedia.org/wiki/Email_address#Local-part is much easier
197+ // https://www.rfc-editor.org/rfc/rfc5322.html#section-3.2.3 has the full list for ASCII part
198+ // See also https://en.wikipedia.org/wiki/Email_address#Local-part
199+ // We add dot (ascii '.'), and then check after for the special dot constraints.
199200
200- static final UnicodeSet EMAIL_EXCLUDES =
201- new UnicodeSet ("[\\ u0020 ; \\ : \" ( ) \\ [ \\ ] @ \\ \\ < >]" ).freeze ();
201+ static final UnicodeSet EMAIL_ASCII_INCLUDES =
202+ new UnicodeSet ("[[a-zA-Z][0-9][_ \\ - ! ? ' \\ { \\ } * / \\ & # % ` \\ ^ + = | ~ \\ $]]" )
203+ .add ('.' )
204+ .freeze ();
202205 static final UnicodeSet validEmailLocalPart =
203206 new UnicodeSet (
204- "[\\ p{XID_Continue}\\ p{block=basic_latin}- \\ p{Cc }]" ,
207+ "[\\ p{XID_Continue}- \\ p{block=basic_latin}]" ,
205208 new ParsePosition (0 ),
206209 VersionedSymbolTable .frozenAt (UNICODE_VERSION ))
207- .removeAll ( EMAIL_EXCLUDES )
210+ .addAll ( EMAIL_ASCII_INCLUDES )
208211 .freeze ();
209212 public static final UnicodeProperty LinkEmail =
210213 new UnicodeSetProperty ()
You can’t perform that action at this time.
0 commit comments