Skip to content

Commit 1569cdd

Browse files
committed
adjust unknown pattern
1 parent c0a7bc7 commit 1569cdd

File tree

1 file changed

+2
-1
lines changed

1 file changed

+2
-1
lines changed

src/main/groovy/ua/net/nlp/tools/tag/TagTextCore.groovy

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,9 @@ class TagTextCore {
3737

3838
// Matches a run of one or more punctuation characters: , . : ; ! ? / ( ) [ ] { },
// the quote marks « » „ “ " ' plus U+201C and U+201D, the ellipsis …, the en dash
// U+2013, the em dash U+2014, the bullets • ■ ♦, and the ASCII hyphen (placed last
// in the class so it is taken literally).
// NOTE(review): the trailing comment holding a lone double quote presumably
// re-balances the quote inside the regex for editor syntax highlighting - confirm.
public static final Pattern PUNCT_PATTERN = Pattern.compile(/[,.:;!?\/()\[\]{}«»„“"'…\u2013\u2014\u201D\u201C•■♦-]+/) // "
3939
// Matches a run of one or more symbol characters: the ASCII symbols % & @ $ * + = < >
// or any code point in the ranges U+00A0..U+00BF, U+2000..U+20CF, U+2100..U+218F
// and U+2200..U+22FF.
public static final Pattern SYMBOL_PATTERN = Pattern.compile(/[%&@$*+=<>\u00A0-\u00BF\u2000-\u20CF\u2100-\u218F\u2200-\u22FF]+/)
40+
// |[а-яіїєґА-ЯІЇЄҐ][а-яіїєґА-ЯІЇЄҐ'\u02BC\u2019]*[а-яіїєґА-ЯІЇЄҐ]-
4041
// Shape of a Cyrillic word, optionally embedded in a hyphenated compound:
//   (.*-)?  optional arbitrary prefix ending in a hyphen
//   one letter from а-я / А-Я or і ї є ґ (either case), followed by one or more
//   of those letters or an apostrophe (ASCII ', U+02BC or U+2019)
//   (-.*)?  optional arbitrary suffix starting with a hyphen
// The core therefore needs at least two characters and cannot start with an apostrophe.
// NOTE(review): whether callers apply this with matches() or find() is not visible here.
static final Pattern UNKNOWN_PATTERN = Pattern.compile(/(.*-)?[а-яіїєґА-ЯІЇЄҐ][а-яіїєґА-ЯІЇЄҐ'\u02BC\u2019]+(-.*)?/)
41-
static final Pattern NON_UK_PATTERN = Pattern.compile(/^[\#№u2013-]|[\u2013-]$|[ыэъё]|[а-яіїєґ][a-z]|[a-z][а-яіїєґ]/, Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE)
42+
// Flags a token as non-Ukrainian when any of these alternatives is found
// (case-insensitively, with Unicode-aware case folding):
//   ^[\#№\u2013-]  the token starts with #, №, an en dash (U+2013) or a hyphen
//   [ыэъё]         it contains a letter that is not in the Ukrainian alphabet
//   [а-яіїєґ][a-z] or [a-z][а-яіїєґ]  a Cyrillic letter is adjacent to an ASCII Latin letter
// The en dash must be written as an escape: a bare "u2013" inside the class would
// instead match the literal characters u, 2, 0, 1 and 3 at the start of a token.
static final Pattern NON_UK_PATTERN = Pattern.compile(/^[\#№\u2013-]|[ыэъё]|[а-яіїєґ][a-z]|[a-z][а-яіїєґ]/, Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE)
4243
// Matches any of three token shapes:
//   1. a Latin-script letter followed by any number of Latin letters, digits or hyphens
//   2. ASCII digits, an optional hyphen, then one or more Cyrillic letters
//   3. one or more Cyrillic letters, an optional hyphen, then ASCII digits
// "Cyrillic letters" here means а-я / А-Я plus і ї є ґ in either case.
static final Pattern UNCLASS_PATTERN = Pattern.compile(/\p{IsLatin}[\p{IsLatin}\p{IsDigit}-]*|[0-9]+-?[а-яіїєґА-ЯІЇЄҐ]+|[а-яіїєґА-ЯІЇЄҐ]+-?[0-9]+/)
4344
// Matches a single Cyrillic letter, case-insensitively: the ranges а-з and й-я plus
// і ї є ґ. Splitting the range at з/й leaves out the letter и, as the trailing
// comment states.
static final Pattern NONINFL_PATTERN = Pattern.compile(/[а-зй-яіїєґ]/, Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE) // exclude Russian и
4445
// Matches a bare opening or closing XML-style tag such as <name> or </name>, where
// the name consists of ASCII letters, digits and underscores. Tags with attributes,
// whitespace or a self-closing slash do not match.
public static final Pattern XML_TAG_PATTERN = Pattern.compile(/<\/?[a-zA-Z_0-9]+>/)

0 commit comments

Comments
 (0)