|
25 | 25 | import java.nio.charset.StandardCharsets; |
26 | 26 | import java.nio.file.Files; |
27 | 27 | import java.nio.file.Path; |
28 | | -import java.text.ParsePosition; |
29 | 28 | import java.util.Comparator; |
30 | 29 | import java.util.EnumMap; |
31 | 30 | import java.util.EnumSet; |
@@ -194,17 +193,17 @@ private LinkTermination(String uset) { |
194 | 193 | } |
195 | 194 | } |
196 | 195 |
|
197 | | - // Note: the source standards are painful to read. |
198 | | - // https://en.wikipedia.org/wiki/Email_address#Local-part is much easier |
| 196 | + // https://datatracker.ietf.org/doc/html/rfc5322#section-3.2.3 has the full list for the ASCII part. |
| 197 | + // See also https://en.wikipedia.org/wiki/Email_address#Local-part |
| 198 | + // We also add the dot (ASCII '.') here, and check the special dot constraints separately afterwards. |
199 | 199 |
|
200 | | - static final UnicodeSet EMAIL_EXCLUDES = |
201 | | - new UnicodeSet("[\\u0020 ; \\: \" ( ) \\[ \\] @ \\\\ < >]").freeze(); |
| 200 | + static final UnicodeSet EMAIL_ASCII_INCLUDES = |
| 201 | + new UnicodeSet("[[a-zA-Z][0-9][_ \\- ! ? ' \\{ \\} * / \\& # % ` \\^ + = | ~ \\$]]") |
| 202 | + .add('.') |
| 203 | + .freeze(); |
202 | 204 | static final UnicodeSet validEmailLocalPart = |
203 | | - new UnicodeSet( |
204 | | - "[\\p{XID_Continue}\\p{block=basic_latin}-\\p{Cc}]", |
205 | | - new ParsePosition(0), |
206 | | - VersionedSymbolTable.frozenAt(UNICODE_VERSION)) |
207 | | - .removeAll(EMAIL_EXCLUDES) |
| 205 | + new UnicodeSet("[\\p{XID_Continue}-\\p{block=basic_latin}]") |
| 206 | + .addAll(EMAIL_ASCII_INCLUDES) |
208 | 207 | .freeze(); |
209 | 208 | public static final UnicodeProperty LinkEmail = |
210 | 209 | new UnicodeSetProperty() |
|
0 commit comments