Skip to content

Commit 64e1fd0

Browse files
committed
Implementing group 3 noun rules for Serbian.
1 parent 367e48e commit 64e1fd0

File tree

3 files changed

+226
-5
lines changed

3 files changed

+226
-5
lines changed

inflection/src/inflection/grammar/synthesis/SrGrammarSynthesizer_SrDisplayFunction.cpp

Lines changed: 203 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,12 @@
1717
#include <inflection/util/LocaleUtils.hpp>
1818
#include <inflection/util/UnicodeSetUtils.hpp>
1919
#include <inflection/npc.hpp>
20+
#include <icu4cxx/RegularExpression.hpp>
21+
#include <array>
2022
#include <iterator>
2123
#include <memory>
24+
#include <string>
25+
#include "SrGrammarSynthesizer_SrDisplayFunction.hpp"
2226

2327
namespace inflection::grammar::synthesis {
2428

@@ -42,7 +46,7 @@ SrGrammarSynthesizer_SrDisplayFunction::~SrGrammarSynthesizer_SrDisplayFunction(
4246
{
4347
}
4448

45-
::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectString(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const
49+
::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectFromDictionary(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const
4650
{
4751
::std::u16string countString(GrammarSynthesizerUtil::getFeatureValue(constraints, numberFeature));
4852
::std::u16string caseString(GrammarSynthesizerUtil::getFeatureValue(constraints, caseFeature));
@@ -61,7 +65,6 @@ ::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectString(const ::s
6165
if (!genderString.empty()) {
6266
string_constraints.emplace_back(genderString);
6367
}
64-
// The nominative/caseless is unmarked in the patterns, so we need to do something like this
6568
int64_t wordGrammemes = 0;
6669
dictionary.getCombinedBinaryType(&wordGrammemes, lemma);
6770

@@ -77,7 +80,65 @@ ::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectString(const ::s
7780
return inflection;
7881
}
7982

80-
::inflection::dialog::DisplayValue * SrGrammarSynthesizer_SrDisplayFunction::getDisplayValue(const dialog::SemanticFeatureModel_DisplayData &displayData, const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &constraints, bool /* enableInflectionGuess */) const
83+
namespace {

// Forward declarations of the rule-based inflectors, one per declension group.

// Masculine or neuter nouns ending in o or e, and masculine nouns ending in a consonant.
::std::u16string inflectByRuleOE(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase, const ::std::u16string& gender);

// Neuter nouns ending in e.
::std::u16string inflectByRuleE(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase, const ::std::u16string& gender);

// Nouns of any gender ending in a.
::std::u16string inflectByRuleA(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase);

// Feminine nouns ending in a consonant.
::std::u16string inflectByRuleConsonant(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase, const ::std::u16string& gender);

// How many grammatical cases Serbian has; this is also the length of every suffix table.
static constexpr auto NUMBER_OF_CASES = 7UL;

// Appends to lemma the suffix that matches the requested number and case.
// suffix_sg holds the singular suffixes and suffix_pl the plural ones.
::std::u16string applySuffix(const ::std::u16string& lemma,
                             const ::std::array<::std::u16string, NUMBER_OF_CASES>& suffix_sg,
                             const ::std::array<::std::u16string, NUMBER_OF_CASES>& suffix_pl,
                             const ::std::u16string& number,
                             const ::std::u16string& targetCase);

// Treats the lemma as a proper noun when its first character is a capital letter.
bool isProperNoun(const ::std::u16string& lemma);

} // namespace
104+
105+
// Rule-based inflection for a lemma, driven by the number, case and gender constraints.
// Returns the lemma unchanged when any of the three constraints is missing, when the
// request is singular nominative, or when the selected rule produces an empty string.
::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectWithRule(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const
{
    const ::std::u16string requestedNumber(GrammarSynthesizerUtil::getFeatureValue(constraints, numberFeature));
    const ::std::u16string requestedCase(GrammarSynthesizerUtil::getFeatureValue(constraints, caseFeature));
    const auto requestedGender = GrammarSynthesizerUtil::getFeatureValue(constraints, genderFeature);

    // All three constraints are needed to choose and apply a rule.
    const bool fullySpecified = !requestedNumber.empty() && !requestedCase.empty() && !requestedGender.empty();
    if (!fullySpecified) {
        return lemma;
    }

    // Singular nominative is returned as the lemma itself.
    if (requestedNumber == GrammemeConstants::NUMBER_SINGULAR() && requestedCase == GrammemeConstants::CASE_NOMINATIVE()) {
        return lemma;
    }

    // Dispatch to one of the four declension groups.
    // NOTE(review): the first test is true for every lemma that would satisfy the second
    // one (ending in е with neuter gender), so the inflectByRuleE branch can never be
    // taken as written. Masculine lemmas ending in a consonant also fall through to
    // inflectByRuleConsonant. Confirm the intended dispatch order.
    const bool endsInOorE = lemma.ends_with(u'о') || lemma.ends_with(u'е');
    ::std::u16string result;
    if (endsInOorE && (requestedGender == GrammemeConstants::GENDER_MASCULINE() || requestedGender == GrammemeConstants::GENDER_NEUTER())) {
        result = inflectByRuleOE(lemma, requestedNumber, requestedCase, requestedGender);
    } else if (lemma.ends_with(u'е') && requestedGender == GrammemeConstants::GENDER_NEUTER()) {
        result = inflectByRuleE(lemma, requestedNumber, requestedCase, requestedGender);
    } else if (lemma.ends_with(u'а')) {
        result = inflectByRuleA(lemma, requestedNumber, requestedCase);
    } else {
        result = inflectByRuleConsonant(lemma, requestedNumber, requestedCase, requestedGender);
    }

    // An empty result means the rule had nothing to offer; keep the lemma.
    return result.empty() ? lemma : result;
}
140+
141+
::inflection::dialog::DisplayValue *SrGrammarSynthesizer_SrDisplayFunction::getDisplayValue(const dialog::SemanticFeatureModel_DisplayData &displayData, const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &constraints, bool /* enableInflectionGuess */) const
81142
{
82143
::std::u16string displayString;
83144
if (!displayData.getValues().empty()) {
@@ -87,9 +148,147 @@ ::inflection::dialog::DisplayValue * SrGrammarSynthesizer_SrDisplayFunction::get
87148
return nullptr;
88149
}
89150
if (dictionary.isKnownWord(displayString)) {
90-
displayString = inflectString(constraints, displayString);
151+
displayString = inflectFromDictionary(constraints, displayString);
152+
} else {
153+
// Let's use rule based inflection for nouns. Assume lemma is singular, nominative.
154+
displayString = inflectWithRule(constraints, displayString);
91155
}
92156
return new ::inflection::dialog::DisplayValue(displayString, constraints);
93157
}
94158

159+
namespace {
160+
161+
// Some rules require number of syllables in the word. It's counted as all vowels plus r if in between consonants, or if it starts a word followed by a consonant.
162+
// We care about 1, 2 and more than 2 cases.
163+
enum class Syllables {
164+
ONE_SYLLABLE,
165+
TWO_SYLLABLES,
166+
MULTI_SILLABLES,
167+
};
168+
Syllables countSyllables(const ::std::u16string& lemma) {
169+
static constexpr ::std::u16string_view vowels = u"аеиоуАЕИОУ";
170+
uint16_t total = 0;
171+
// Find vowels.
172+
for (const char16_t ch: lemma) {
173+
if (vowels.find(ch) != ::std::string::npos) {
174+
++total;
175+
}
176+
}
177+
// Find r.
178+
static constexpr ::std::u16string_view regex = u"([^аеиоу]р[^аеиоу])|(^р[^аеиоу])";
179+
::icu4cxx::RegularExpression re(regex, UREGEX_CASE_INSENSITIVE, nullptr);
180+
re.setText(lemma);
181+
while (re.findNext()) {
182+
++total;
183+
}
184+
185+
if (total == 1) {
186+
return Syllables::ONE_SYLLABLE;
187+
} else if (total == 2) {
188+
return Syllables::TWO_SYLLABLES;
189+
} else {
190+
return Syllables::MULTI_SILLABLES;
191+
}
192+
}
193+
194+
// Placeholder for the rule covering masculine or neuter nouns ending in o or e.
// Returns the lemma unchanged for now; number, case and gender are accepted but not used yet.
::std::u16string inflectByRuleOE(const ::std::u16string &lemma, const ::std::u16string & /* number */, const ::std::u16string & /* targetCase */, const ::std::u16string & /* gender */)
{
    // TODO(nciric): implement logic.
    // Parameter names are commented out so unused-parameter warnings stay silent
    // without copying the arguments.
    return lemma;
}
203+
204+
// Placeholder for the rule covering neuter nouns ending in e.
// Returns the lemma unchanged for now; number, case and gender are accepted but not used yet.
::std::u16string inflectByRuleE(const ::std::u16string &lemma, const ::std::u16string & /* number */, const ::std::u16string & /* targetCase */, const ::std::u16string & /* gender */)
{
    // TODO(nciric): implement logic.
    // Parameter names are commented out so unused-parameter warnings stay silent
    // without copying the arguments.
    return lemma;
}
213+
214+
::std::u16string inflectByRuleA(const ::std::u16string &lemma, const ::std::u16string &number, const ::std::u16string &targetCase)
215+
{
216+
static constexpr auto suffix_sg = ::std::to_array<::std::u16string>({u"а", u"е", u"и", u"у", u"а", u"ом", u"и"});
217+
static constexpr auto suffix_pl = ::std::to_array<::std::u16string>({u"е", u"а", u"ама", u"е", u"е", u"ама", u"ама"});
218+
219+
::std::u16string base = lemma;
220+
// Remove trailing a and apply suffix.
221+
base.pop_back();
222+
base = applySuffix(base, suffix_sg, suffix_pl, number, targetCase);
223+
224+
// Vocative singular and genitive plural require special processing in some cases.
225+
if (number == GrammemeConstants::NUMBER_SINGULAR() && targetCase == GrammemeConstants::CASE_VOCATIVE()) {
226+
Syllables syllables = countSyllables(lemma);
227+
if (lemma.ends_with(u"ица") && syllables == Syllables::MULTI_SILLABLES) {
228+
base.back() = u'е';
229+
}
230+
if (isProperNoun(lemma) && syllables == Syllables::TWO_SYLLABLES) {
231+
base.back() = u'о';
232+
}
233+
}
234+
235+
if (number == GrammemeConstants::NUMBER_PLURAL() && targetCase == GrammemeConstants::CASE_GENITIVE()) {
236+
if (lemma.ends_with(u"тња") || lemma.ends_with(u"дња") || lemma.ends_with(u"пта") || lemma.ends_with(u"лба") || lemma.ends_with(u"рва")) {
237+
base.back() = u'и';
238+
}
239+
size_t pos = 0;
240+
if ((pos = base.rfind(u"јк")) != ::std::u16string::npos) base.replace(pos, 2, u"јак");
241+
if ((pos = base.rfind(u"мљ")) != ::std::u16string::npos) base.replace(pos, 2, u"маљ");
242+
if ((pos = base.rfind(u"вц")) != ::std::u16string::npos) base.replace(pos, 2, u"вац");
243+
if ((pos = base.rfind(u"тк")) != ::std::u16string::npos) base.replace(pos, 2, u"так");
244+
if ((pos = base.rfind(u"пк")) != ::std::u16string::npos) base.replace(pos, 2, u"пак");
245+
}
246+
247+
return base;
248+
}
249+
250+
// Placeholder for the rule covering nouns ending in a consonant.
// Returns the lemma unchanged for now; number, case and gender are accepted but not used yet.
::std::u16string inflectByRuleConsonant(const ::std::u16string &lemma, const ::std::u16string & /* number */, const ::std::u16string & /* targetCase */, const ::std::u16string & /* gender */)
{
    // TODO(nciric): implement logic.
    // Parameter names are commented out so unused-parameter warnings stay silent
    // without copying the arguments.
    return lemma;
}
259+
260+
::std::u16string applySuffix(const ::std::u16string &lemma, const ::std::array<::std::u16string, NUMBER_OF_CASES>& suffix_sg, const ::std::array<::std::u16string, NUMBER_OF_CASES>& suffix_pl,
261+
const ::std::u16string &number, const ::std::u16string &targetCase)
262+
{
263+
const ::std::map<::std::u16string, size_t> case_index = {
264+
{GrammemeConstants::CASE_NOMINATIVE(), 0},
265+
{GrammemeConstants::CASE_GENITIVE(), 1},
266+
{GrammemeConstants::CASE_DATIVE(), 2},
267+
{GrammemeConstants::CASE_ACCUSATIVE(), 3},
268+
{GrammemeConstants::CASE_VOCATIVE(), 4},
269+
{GrammemeConstants::CASE_INSTRUMENTAL(), 5},
270+
{GrammemeConstants::CASE_LOCATIVE(), 6}
271+
};
272+
273+
auto index = case_index.at(targetCase);
274+
275+
if (number == GrammemeConstants::NUMBER_SINGULAR()) {
276+
return lemma + suffix_sg[index];
277+
} else {
278+
return lemma + suffix_pl[index];
279+
}
280+
}
281+
282+
// Reports whether lemma looks like a proper noun, judged only by its first character:
// true when that character lies in U+0402..U+0428 (Cyrillic capitals from Ђ through Ш).
// An empty lemma is never a proper noun.
bool isProperNoun(const ::std::u16string &lemma) {
    // front() on an empty string is undefined behaviour, so rule that case out first.
    if (lemma.empty()) {
        return false;
    }

    const char16_t first_ch = lemma.front();
    return 0x402 <= first_ch && first_ch <= 0x428;
}
291+
292+
} // namespace
293+
95294
} // namespace inflection::grammar::synthesis

inflection/src/inflection/grammar/synthesis/SrGrammarSynthesizer_SrDisplayFunction.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ class inflection::grammar::synthesis::SrGrammarSynthesizer_SrDisplayFunction
3030
SrGrammarSynthesizer_SrDisplayFunction& operator=(const SrGrammarSynthesizer_SrDisplayFunction&) = delete;
3131

3232
private:
33-
::std::u16string inflectString(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const;
33+
::std::u16string inflectFromDictionary(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const;
34+
::std::u16string inflectWithRule(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const;
3435

3536
const ::inflection::dictionary::DictionaryMetaData& dictionary;
3637
const ::inflection::dialog::SemanticFeature& caseFeature;

inflection/test/resources/inflection/dialog/inflection/sr.xml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,25 @@
1818
<!-- test><source case="vocative" number="singular" gender="masculine" pos="noun">уранак</source><result>уранче</result></test -->
1919
<!-- test><source case="vocative" number="singular" gender="masculine" pos="noun">игроказ</source><result>игрокаже</result></test -->
2020
<!-- test><source case="vocative" number="singular" gender="masculine" pos="noun">пашњак</source><result>пашњаче</result></test -->
21+
<!-- Rule based inflection, group 3, all nouns ending with a -->
22+
<test><source case="instrumental" number="singular" gender="feminine" pos="noun">Италија</source><result>Италијом</result></test>
23+
<test><source case="instrumental" number="singular" gender="feminine" pos="noun">авенија</source><result>авенијом</result></test>
24+
<test><source case="locative" number="plural" gender="feminine" pos="noun">авенија</source><result>авенијама</result></test>
25+
<test><source case="vocative" number="singular" gender="masculine" pos="noun">кадија</source><result>кадија</result></test>
26+
<test><source case="vocative" number="singular" gender="feminine" pos="noun">уметница</source><result>уметнице</result></test>
27+
<test><source case="vocative" number="singular" gender="feminine" pos="noun">птица</source><result>птица</result></test>
28+
<test><source case="vocative" number="singular" gender="feminine" pos="noun">Стана</source><result>Стано</result></test>
29+
<test><source case="vocative" number="singular" gender="feminine" pos="noun">Зора</source><result>Зоро</result></test>
30+
<test><source case="vocative" number="singular" gender="masculine" pos="noun">Божа</source><result>Божо</result></test>
31+
<test><source case="vocative" number="singular" gender="masculine" pos="noun">Љуба</source><result>Љубо</result></test>
32+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">пратња</source><result>пратњи</result></test>
33+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">радња</source><result>радњи</result></test>
34+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">лопта</source><result>лопти</result></test>
35+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">молба</source><result>молби</result></test>
36+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">конзерва</source><result>конзерви</result></test>
37+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">гошћа</source><result>гошћа</result></test>
38+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">двојка</source><result>двојака</result></test>
39+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">битка</source><result>битака</result></test>
40+
<!-- There are some exceptions, like pripovetka, where tk -> dak because of the base word. Such words have to be handled as dictionary exceptions. -->
41+
<!-- <test><source case="genitive" number="plural" gender="feminine" pos="noun">приповетка</source><result>приповедака</result></test> -->
2142
</inflectionTest>

0 commit comments

Comments
 (0)