Skip to content

Commit 5f9f1ee

Browse files
committed
Latin/Cyrillic fix adjustments
Make this cleanup less aggressive to produce fewer false positives (FP)
1 parent 2435eaa commit 5f9f1ee

File tree

2 files changed

+68
-60
lines changed

2 files changed

+68
-60
lines changed

src/main/groovy/ua/net/nlp/other/clean/LatCyrModule.groovy

Lines changed: 58 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,8 @@ class LatCyrModule {
8383
for(int ii=0; ii<10; ii++) {
8484
cont = false
8585

86-
def m1 = t0 =~ /([XVI])([ХІ])/
86+
// exclude IІвана
87+
def m1 = t0 =~ /([XVI])([ХІ])(?![а-яіїєґa-z])/
8788

8889
if( m1 ) {
8990
cont = true
@@ -96,7 +97,7 @@ class LatCyrModule {
9697
} } )
9798
}
9899

99-
def m2 = t0 =~ /([ХІ])([XVI])/
100+
def m2 = t0 =~ /([ХІ])([XVI])(?![а-яіїєґa-z])/
100101
if( m2 ) {
101102
cont = true
102103
// t0 = null // ml
@@ -110,59 +111,59 @@ class LatCyrModule {
110111
t0
111112
}
112113

113-
@CompileStatic
114-
String fixReliableCyr(String text, int[] counts) {
115-
// exclusively cyrillic letter followed by latin looking like cyrillic
116-
// def t1 = text.replaceAll(/([бвгґдєжзийклмнптфцчшщьюяБГҐДЄЖЗИЙЛПФХЦЧШЩЬЮЯ]['’ʼ]?)([aceiopxyABCEHIKMOPTXYáÁéÉíÍḯḮóÓúýÝ])/, { all, cyr, lat ->
117-
def m1 = text =~ /([бвгґдєжзийклмнптфцчшщьюяѣБГҐДЄЖЗИЙЛПФХЦЧШЩЬЮЯ]['’ʼ]?)([aceiopxyABCEHIKMOPTXYáÁéÉíÍḯḮóÓúýÝ])/
118-
def t1 = m1.replaceAll( new Function<MatchResult, String>() { String apply(MatchResult mr) { // { mr -> // all, cyr, lat
119-
def cyr = mr.group(1)
120-
def lat = mr.group(2)
121-
out.debug "mix: 1.1"
122-
counts[0] += 1
123-
cyr.concat(latToCyrMap[lat])
124-
} } )
125-
126-
// exclusively cyrillic letter preceeded by latin looking like cyrillic
127-
128-
// text.replaceAll(/([aceiopxyABCEHIKMOPTXYáÁéÉíÍḯḮóÓúýÝ])(['’ʼ]?[бвгґдєжзийклмнптфцчшщьюяБГҐДЄЖЗИЙЛПФХЦЧШЩЬЮЯ])/, { all, lat, cyr ->
129-
def m2 = t1 =~ /([aceiopxyABCEHIKMOPTXYáÁéÉíÍḯḮóÓúýÝ])(['’ʼ]?[бвгґдєжзийклмнптфцчшщьюяѣБГҐДЄЖЗИЙЛПФХЦЧШЩЬЮЯ])/
130-
// t1 = null // ml
131-
def t2 = m2.replaceAll( new Function<MatchResult, String>() { String apply(MatchResult mr) { // { mr -> // lat, cyr
132-
def lat = mr.group(1)
133-
def cyr = mr.group(2)
134-
out.debug "mix: 1.2"
135-
counts[0] += 1
136-
assert cyr
137-
latToCyrMap[lat].concat(cyr)
138-
} } )
139-
}
140-
141-
@CompileStatic
142-
String fixReliableLat(String text, int[] counts) {
143-
// def t1 = text.replaceAll(/([bdfghjklmnrstuvwzDFGJLNQRSUVWZ]['’ʼ]?)([асеіорхуАВСЕНІКМНОРТХУ])/, { all, lat, cyr ->
144-
def m1 = text =~ /([bdfghjklmnrstuvwzDFGJLNQRSUVWZ]['’ʼ]?)([асеіорхуАВСЕНІКМНОРТХУ])/
145-
def t1 = m1.replaceAll( new Function<MatchResult, String>() { String apply(MatchResult mr) {
146-
def lat = mr.group(1)
147-
def cyr = mr.group(2)
148-
out.debug "mix: 1.3"
149-
counts[1] += 2
150-
assert cyrToLatMap[cyr]
151-
lat.concat(cyrToLatMap[cyr])
152-
} } )
153-
154-
// def t2 = t1.replaceAll(/([асеіорхуАВСЕНІКМНОРТХУ])(['’ʼ]?[bdfghjklmnrstuvwzDFGJLNQRSUVWZ])/, { all, cyr, lat ->
155-
def m2 = t1 =~ /([асеіорхуАВСЕНІКМНОРТХУ])(['’ʼ]?[bdfgjklmnrstuvwzDFGJLNQRSUVWZ])/ // h is often == ѣ
156-
// t1 = null // ml
157-
m2.replaceAll( new Function<MatchResult, String>() { String apply(MatchResult mr) {
158-
def cyr = mr.group(1)
159-
def lat = mr.group(2)
160-
out.debug "mix: 1.4"
161-
counts[1] += 2
162-
assert lat
163-
cyrToLatMap[cyr].concat(lat)
164-
} } )
165-
}
114+
// @CompileStatic
115+
// String fixReliableCyr(String text, int[] counts) {
116+
// // exclusively cyrillic letter followed by latin looking like cyrillic
117+
//// def t1 = text.replaceAll(/([бвгґдєжзийклмнптфцчшщьюяБГҐДЄЖЗИЙЛПФХЦЧШЩЬЮЯ]['’ʼ]?)([aceiopxyABCEHIKMOPTXYáÁéÉíÍḯḮóÓúýÝ])/, { all, cyr, lat ->
118+
// def m1 = text =~ /([бвгґдєжзийклмнптфцчшщьюяѣБГҐДЄЖЗИЙЛПФХЦЧШЩЬЮЯ]['’ʼ]?)([aceiopxyABCEHIKMOPTXYáÁéÉíÍḯḮóÓúýÝ])/
119+
// def t1 = m1.replaceAll( new Function<MatchResult, String>() { String apply(MatchResult mr) { // { mr -> // all, cyr, lat
120+
// def cyr = mr.group(1)
121+
// def lat = mr.group(2)
122+
// out.debug "mix: 1.1"
123+
// counts[0] += 1
124+
// cyr.concat(latToCyrMap[lat])
125+
// } } )
126+
//
127+
// // exclusively cyrillic letter preceded by latin looking like cyrillic
128+
//
129+
//// text.replaceAll(/([aceiopxyABCEHIKMOPTXYáÁéÉíÍḯḮóÓúýÝ])(['’ʼ]?[бвгґдєжзийклмнптфцчшщьюяБГҐДЄЖЗИЙЛПФХЦЧШЩЬЮЯ])/, { all, lat, cyr ->
130+
// def m2 = t1 =~ /([aceiopxyABCEHIKMOPTXYáÁéÉíÍḯḮóÓúýÝ])(['’ʼ]?[бвгґдєжзийклмнптфцчшщьюяѣБГҐДЄЖЗИЙЛПФХЦЧШЩЬЮЯ])/
131+
//// t1 = null // ml
132+
// def t2 = m2.replaceAll( new Function<MatchResult, String>() { String apply(MatchResult mr) { // { mr -> // lat, cyr
133+
// def lat = mr.group(1)
134+
// def cyr = mr.group(2)
135+
// out.debug "mix: 1.2"
136+
// counts[0] += 1
137+
// assert cyr
138+
// latToCyrMap[lat].concat(cyr)
139+
// } } )
140+
// }
141+
//
142+
// @CompileStatic
143+
// String fixReliableLat(String text, int[] counts) {
144+
//// def t1 = text.replaceAll(/([bdfghjklmnrstuvwzDFGJLNQRSUVWZ]['’ʼ]?)([асеіорхуАВСЕНІКМНОРТХУ])/, { all, lat, cyr ->
145+
// def m1 = text =~ /([bdfghjklmnrstuvwzDFGJLNQRSUVWZ]['’ʼ]?)([асеіорхуАВСЕНІКМНОРТХУ])/
146+
// def t1 = m1.replaceAll( new Function<MatchResult, String>() { String apply(MatchResult mr) {
147+
// def lat = mr.group(1)
148+
// def cyr = mr.group(2)
149+
// out.debug "mix: 1.3"
150+
// counts[1] += 2
151+
// assert cyrToLatMap[cyr]
152+
// lat.concat(cyrToLatMap[cyr])
153+
// } } )
154+
//
155+
//// def t2 = t1.replaceAll(/([асеіорхуАВСЕНІКМНОРТХУ])(['’ʼ]?[bdfghjklmnrstuvwzDFGJLNQRSUVWZ])/, { all, cyr, lat ->
156+
// def m2 = t1 =~ /([асеіорхуАВСЕНІКМНОРТХУ])(['’ʼ]?[bdfgjklmnrstuvwzDFGJLNQRSUVWZ])/ // h is often == ѣ
157+
//// t1 = null // ml
158+
// m2.replaceAll( new Function<MatchResult, String>() { String apply(MatchResult mr) {
159+
// def cyr = mr.group(1)
160+
// def lat = mr.group(2)
161+
// out.debug "mix: 1.4"
162+
// counts[1] += 2
163+
// assert lat
164+
// cyrToLatMap[cyr].concat(lat)
165+
// } } )
166+
// }
166167

167168
@CompileStatic
168169
String fixCharBetweenOthers(String text, int[] counts) {
@@ -182,7 +183,7 @@ class LatCyrModule {
182183
}
183184

184185
private static Pattern SMALL_UK_BIG_EN = ~ /([а-яіїєґ])([A-Z])/
185-
private static Pattern ALL_EN_ALL_UK = ~ /([A-Za-z]+)([а-яіїєґ'\u2019\u02bc]+)/
186+
private static Pattern ALL_EN_ALL_UK = ~ /(?U)\b([A-Za-z]+)([а-яіїєґ'\u2019\u02bc]+)\b/
186187

187188
@CompileStatic
188189
String fixToSplit(String text, int[] counts) {
@@ -208,7 +209,7 @@ class LatCyrModule {
208209
if( m2 ) {
209210
def en = m2.group(1)
210211
def uk = m2.group(2)
211-
if( en.length() >= 2 && uk.length() >= 3
212+
if( en.length() >= 3 && uk.length() >= 4
212213
&& ltModule.knownWord(uk)
213214
&& ltModule.knownWordEn(en) ) {
214215
out.debug "mix: 2.2"

src/test/groovy/ua/net/nlp/other/clean/CleanLatCyrTest.groovy

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,17 @@ class CleanLatCyrTest {
6666

6767
// do not touch
6868
assertEquals "Renault Kangoo", clean("Renault Kangoo")
69-
70-
// do not touch
7169
assertEquals "FREЕДОМ", clean("FREЕДОМ")
72-
70+
// Mo - Latin
71+
assertEquals "Moстиск", clean("Moстиск")
72+
// pe - Latin
73+
assertEquals "пеpeсвідчив", clean("пеpeсвідчив")
74+
// cap - Latin
75+
assertEquals "цїcapска", clean("цїcapска")
76+
assertEquals "Pianoбой", clean("Pianoбой")
77+
78+
assertEquals " IІвана", clean(" IІвана")
79+
7380
// do not touch
7481
assertEquals "квадрокоптери Aquila16-fpv-kit.", clean("квадрокоптери Aquila16-fpv-kit.")
7582

0 commit comments

Comments
 (0)