Skip to content

Commit 706f13e

Browse files
committed
drop tag weight for unknown lemmas
1 parent de98afa commit 706f13e

File tree

2 files changed

+17
-8
lines changed

2 files changed

+17
-8
lines changed

src/main/groovy/ua/net/nlp/tools/tag/DisambigStats.groovy

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ public class DisambigStats {
2626
private static final String statsFile = "/ua/net/nlp/tools/stats/lemma_freqs_hom.txt"
2727
static final String statsVersion = "3.2.1"
2828

29-
boolean disambigBySuffix = true //DisambigModule.wordEnding in options.disambiguate
29+
boolean disambigBySuffix = true
3030
boolean disambigByContext = true
3131
boolean writeDerivedStats = false
3232

@@ -145,9 +145,9 @@ public class DisambigStats {
145145
double wordRate = getRateByWord(anToken, statsForWord, ti, ctxQ_)
146146
double rate = wordRate
147147

148-
boolean prevPrep = ti.idx > 0 && hasPosTag(ti.tokens[ti.idx-1], "prep")
149-
boolean unforceTag = ! prevPrep && ti.tokens[ti.idx].getCleanToken().endsWith("ів")
150-
148+
boolean unforceTag = statsForWord != null && ! statsForWord.any { wr, stat -> wr.lemma == anToken.lemma }
149+
debugStats(" unforce: %s", unforceTag)
150+
151151
if( ti.idx > 0
152152
&& anToken.getPOSTag().contains(":prop")
153153
&& anToken.getLemma() ==~ /[А-ЯІЇЄҐ][а-яіїєґ'-]{3,}(-[А-ЯІЇЄҐ][а-яіїєґ'-]{3,})?/ ) {
@@ -169,8 +169,8 @@ public class DisambigStats {
169169
sfxRate = getRateBySuffix(anToken, ti, sfx2RateSum, ctxQ, 2)
170170
}
171171
if( sfxRate ) {
172-
sfxRate /= 6.1e3
173-
// sfxRate /= unforceTag ? 6.1e4 : 6.1e3
172+
// sfxRate /= 6.1e3
173+
sfxRate /= unforceTag ? 6.1e5 : 6.1e3
174174
debugStats(" sfx3 rate: -> %f", round(sfxRate))
175175
rate += sfxRate
176176
wordEndingUsed = true
@@ -183,7 +183,7 @@ public class DisambigStats {
183183
double ctxQ = 6.0e7 // 4.5e7
184184
double postagRate = getRateByTag(anToken, ti, withXp, tagRateSum, ctxQ)
185185
if( postagRate ) {
186-
postagRate /= unforceTag ? 6.1e4 : 6.2e3
186+
postagRate /= unforceTag ? 6.1e5 : 6.2e3
187187
debugStats(" tag rate: -> %f", round(postagRate))
188188
rate += postagRate
189189
}

src/test/groovy/ua/net/nlp/tools/TagTextDisambigTest.groovy

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package ua.net.nlp.tools
55
import static org.junit.jupiter.api.Assertions.assertEquals
66
import static org.junit.jupiter.api.Assertions.assertFalse
77
import static org.junit.jupiter.api.Assertions.assertNotEquals
8+
import static org.junit.jupiter.api.Assertions.assertTrue
89
import static org.junit.jupiter.api.Assumptions.assumeTrue
910

1011
import org.checkerframework.framework.qual.IgnoreInWholeProgramInference
@@ -512,7 +513,15 @@ class TagTextDisambigTest {
512513
"""
513514
assertEquals expected, tagged.tagged
514515
}
515-
516+
517+
518+
// Tags the phrase "й пані Людмили" and checks that the tagged output
// contains a reading whose lemma attribute is "пані".
@Test
519+
public void testPani() {
520+
def tagged = tagText.tagText("й пані Людмили")
521+
522+
assertTrue tagged.tagged.contains("lemma=\"пані\"")
523+
}
524+
516525

517526
// що нижче по схилу
518527
// Дорога забрала

0 commit comments

Comments
 (0)