Skip to content

Commit 3a034d4

Browse files
committed
Add changes for email property
1 parent 6df8f96 commit 3a034d4

File tree

1 file changed

+9
-10
lines changed

1 file changed

+9
-10
lines changed

unicodetools/src/main/java/org/unicode/utilities/LinkUtilities.java

Lines changed: 9 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,6 @@
2525
import java.nio.charset.StandardCharsets;
2626
import java.nio.file.Files;
2727
import java.nio.file.Path;
28-
import java.text.ParsePosition;
2928
import java.util.Comparator;
3029
import java.util.EnumMap;
3130
import java.util.EnumSet;
@@ -194,17 +193,17 @@ private LinkTermination(String uset) {
194193
}
195194
}
196195

197-
// Note: the source standards are painful to read.
198-
// https://en.wikipedia.org/wiki/Email_address#Local-part is much easier
196+
// https://datatracker.ietf.org/doc/html/rfc5322#section-3.2.3 has the full list for the ASCII part
197+
// See also https://en.wikipedia.org/wiki/Email_address#Local-part
198+
// We add dot (ASCII '.'), and then check afterwards for the special dot constraints.
199199

200-
static final UnicodeSet EMAIL_EXCLUDES =
201-
new UnicodeSet("[\\u0020 ; \\: \" ( ) \\[ \\] @ \\\\ < >]").freeze();
200+
static final UnicodeSet EMAIL_ASCII_INCLUDES =
201+
new UnicodeSet("[[a-zA-Z][0-9][_ \\- ! ? ' \\{ \\} * / \\& # % ` \\^ + = | ~ \\$]]")
202+
.add('.')
203+
.freeze();
202204
static final UnicodeSet validEmailLocalPart =
203-
new UnicodeSet(
204-
"[\\p{XID_Continue}\\p{block=basic_latin}-\\p{Cc}]",
205-
new ParsePosition(0),
206-
VersionedSymbolTable.frozenAt(UNICODE_VERSION))
207-
.removeAll(EMAIL_EXCLUDES)
205+
new UnicodeSet("[\\p{XID_Continue}-\\p{block=basic_latin}]")
206+
.addAll(EMAIL_ASCII_INCLUDES)
208207
.freeze();
209208
public static final UnicodeProperty LinkEmail =
210209
new UnicodeSetProperty()

0 commit comments

Comments
 (0)