Skip to content

Commit 60043f1

Browse files
authored
Implementing group 3 noun rules for Serbian. (#173)
* Implementing group 3 noun rules for Serbian. * Convert to_array to manual initialization because of macOS. * Using u16string_view to avoid allocation in constexpr. * Fix spelling. * Replace regex with a simple loop for performance reasons. * Use [[maybe_unused]] on unused parameters. * Remove unnecessary includes, optimize suffix handling code. * Add isConsonant/isVowel functions and simplify code with them. * Enable inflection guess check.
1 parent 367e48e commit 60043f1

File tree

3 files changed

+234
-5
lines changed

3 files changed

+234
-5
lines changed

inflection/src/inflection/grammar/synthesis/SrGrammarSynthesizer_SrDisplayFunction.cpp

Lines changed: 211 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,18 @@
1111
#include <inflection/dialog/SemanticFeatureModel.hpp>
1212
#include <inflection/dialog/SemanticFeatureModel_DisplayData.hpp>
1313
#include <inflection/dialog/DisplayValue.hpp>
14+
#include <inflection/dictionary/PhraseProperties.hpp>
1415
#include <inflection/grammar/synthesis/GrammemeConstants.hpp>
1516
#include <inflection/grammar/synthesis/GrammarSynthesizerUtil.hpp>
1617
#include <inflection/lang/StringFilterUtil.hpp>
1718
#include <inflection/util/LocaleUtils.hpp>
19+
#include <inflection/util/StringViewUtils.hpp>
1820
#include <inflection/util/UnicodeSetUtils.hpp>
1921
#include <inflection/npc.hpp>
22+
#include <array>
2023
#include <iterator>
2124
#include <memory>
25+
#include <string>
2226

2327
namespace inflection::grammar::synthesis {
2428

@@ -42,7 +46,7 @@ SrGrammarSynthesizer_SrDisplayFunction::~SrGrammarSynthesizer_SrDisplayFunction(
4246
{
4347
}
4448

45-
::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectString(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const
49+
::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectFromDictionary(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const
4650
{
4751
::std::u16string countString(GrammarSynthesizerUtil::getFeatureValue(constraints, numberFeature));
4852
::std::u16string caseString(GrammarSynthesizerUtil::getFeatureValue(constraints, caseFeature));
@@ -61,7 +65,6 @@ ::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectString(const ::s
6165
if (!genderString.empty()) {
6266
string_constraints.emplace_back(genderString);
6367
}
64-
// The nominative/caseless is unmarked in the patterns, so we need to do something like this
6568
int64_t wordGrammemes = 0;
6669
dictionary.getCombinedBinaryType(&wordGrammemes, lemma);
6770

@@ -77,7 +80,66 @@ ::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectString(const ::s
7780
return inflection;
7881
}
7982

80-
::inflection::dialog::DisplayValue * SrGrammarSynthesizer_SrDisplayFunction::getDisplayValue(const dialog::SemanticFeatureModel_DisplayData &displayData, const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &constraints, bool /* enableInflectionGuess */) const
83+
// File-local forward declarations for the rule-based noun inflection helpers; the definitions appear later in this file.
namespace {
84+
85+
// Rule-based inflectors, one per declension group (four groups in total).
86+
// Group: masculine or neuter nouns ending in о or е, and masculine nouns ending with a consonant.
87+
::std::u16string inflectByRuleOE(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase, const ::std::u16string& gender);
88+
// Group: neuter nouns ending in е.
89+
::std::u16string inflectByRuleE(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase, const ::std::u16string& gender);
90+
// Group: nouns of any gender ending in а (no gender argument is taken).
91+
::std::u16string inflectByRuleA(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase);
92+
// Group: feminine nouns ending with a consonant.
93+
::std::u16string inflectByRuleConsonant(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase, const ::std::u16string& gender);
94+
95+
// Number of grammatical cases in Serbian; also the length of every suffix table passed to applySuffix.
96+
static constexpr auto NUMBER_OF_CASES = 7UL;
97+
98+
// Appends the suffix matching the requested number and case to the given stem.
// Arguments, in order: stem, singular suffix table, plural suffix table, number, target case.
99+
::std::u16string applySuffix(const ::std::u16string&, const ::std::array<::std::u16string_view, NUMBER_OF_CASES>&, const ::std::array<::std::u16string_view, NUMBER_OF_CASES>&, const ::std::u16string&, const ::std::u16string&);
100+
101+
// Treats the lemma as a proper noun when its first character is a capital Cyrillic letter.
102+
bool isProperNoun(const ::std::u16string &lemma);
103+
104+
} // namespace
105+
106+
/**
 * Rule-based inflection of a lemma according to the requested number, case and gender.
 *
 * @param constraints Requested semantic features; number, case and gender are read from it.
 * @param lemma The word to inflect.
 * @return The inflected form, or the lemma itself when any of the three features is missing,
 *         when singular nominative is requested, or when the selected rule yields an empty string.
 */
::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectWithRule(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const
{
    const ::std::u16string requestedNumber(GrammarSynthesizerUtil::getFeatureValue(constraints, numberFeature));
    const ::std::u16string requestedCase(GrammarSynthesizerUtil::getFeatureValue(constraints, caseFeature));
    const auto requestedGender = GrammarSynthesizerUtil::getFeatureValue(constraints, genderFeature);

    // All three features are required to pick a rule; otherwise leave the word untouched.
    if (requestedNumber.empty() || requestedCase.empty() || requestedGender.empty()) {
        return lemma;
    }

    // Singular nominative is returned as is.
    const bool wantsSingular = requestedNumber == GrammemeConstants::NUMBER_SINGULAR();
    if (wantsSingular && requestedCase == GrammemeConstants::CASE_NOMINATIVE()) {
        return lemma;
    }

    const bool endsInO = lemma.ends_with(u'о');
    const bool endsInE = lemma.ends_with(u'е');
    const bool isMasculine = requestedGender == GrammemeConstants::GENDER_MASCULINE();
    const bool isNeuter = requestedGender == GrammemeConstants::GENDER_NEUTER();

    // Dispatch to one of the four declension groups.
    ::std::u16string inflected;
    if ((endsInO || endsInE) && (isMasculine || isNeuter)) {
        inflected = inflectByRuleOE(lemma, requestedNumber, requestedCase, requestedGender);
    } else if (endsInE && isNeuter) {
        // NOTE(review): this branch cannot be reached, because the condition above already
        // matches every neuter lemma ending in е. Confirm the intended split between the groups.
        inflected = inflectByRuleE(lemma, requestedNumber, requestedCase, requestedGender);
    } else if (lemma.ends_with(u'а')) {
        inflected = inflectByRuleA(lemma, requestedNumber, requestedCase);
    } else {
        inflected = inflectByRuleConsonant(lemma, requestedNumber, requestedCase, requestedGender);
    }

    // An empty result means the rule produced nothing usable; fall back to the lemma.
    return inflected.empty() ? lemma : inflected;
}
141+
142+
::inflection::dialog::DisplayValue *SrGrammarSynthesizer_SrDisplayFunction::getDisplayValue(const dialog::SemanticFeatureModel_DisplayData &displayData, const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &constraints, bool enableInflectionGuess) const
81143
{
82144
::std::u16string displayString;
83145
if (!displayData.getValues().empty()) {
@@ -87,9 +149,154 @@ ::inflection::dialog::DisplayValue * SrGrammarSynthesizer_SrDisplayFunction::get
87149
return nullptr;
88150
}
89151
if (dictionary.isKnownWord(displayString)) {
90-
displayString = inflectString(constraints, displayString);
152+
displayString = inflectFromDictionary(constraints, displayString);
153+
} else if (enableInflectionGuess) {
154+
// Let's use rule based inflection for nouns. Assume lemma is singular, nominative.
155+
displayString = inflectWithRule(constraints, displayString);
91156
}
92157
return new ::inflection::dialog::DisplayValue(displayString, constraints);
93158
}
94159

160+
namespace {
161+
162+
static bool isConsonant(char16_t ch) {
163+
return ::inflection::lang::StringFilterUtil::CYRILLIC_SCRIPT().contains(ch) && !::inflection::dictionary::PhraseProperties::DEFAULT_VOWELS_START().contains(ch);
164+
}
165+
166+
static bool isVowel(char16_t ch) {
167+
return ::inflection::lang::StringFilterUtil::CYRILLIC_SCRIPT().contains(ch) && ::inflection::dictionary::PhraseProperties::DEFAULT_VOWELS_START().contains(ch);
168+
}
169+
170+
// Some rules depend on the number of syllables in the word. A syllable is counted for every vowel, plus every р that
// sits between two consonants or that starts the word and is followed by a consonant.
171+
// Only three buckets matter to the rules: one syllable, two syllables, and everything else.
// countSyllables maps any total other than 1 or 2 to MULTI_SYLLABLES.
172+
enum class Syllables {
173+
ONE_SYLLABLE,
174+
TWO_SYLLABLES,
175+
MULTI_SYLLABLES,
176+
};
177+
// Classifies the lemma by syllable count.
// Every character accepted by isVowel counts once. A р/Р also counts once when it is followed by a
// consonant and is either the first character or preceded by a consonant.
// A total of 1 or 2 maps to the matching enumerator; any other total (including 0) maps to MULTI_SYLLABLES.
Syllables countSyllables(const ::std::u16string& lemma) {
    const size_t size = lemma.length();
    uint16_t syllableCount = 0;

    for (size_t pos = 0; pos < size; ++pos) {
        const char16_t current = lemma[pos];
        if (isVowel(current)) {
            ++syllableCount;
        }

        // Syllabic р: it needs a consonant after it, and a consonant before it unless it opens the word.
        const bool isR = (current == u'р' || current == u'Р');
        const bool hasNext = pos + 1 < size;
        if (isR && hasNext && isConsonant(lemma[pos + 1]) && (pos == 0 || isConsonant(lemma[pos - 1]))) {
            ++syllableCount;
        }
    }

    switch (syllableCount) {
        case 1:
            return Syllables::ONE_SYLLABLE;
        case 2:
            return Syllables::TWO_SYLLABLES;
        default:
            return Syllables::MULTI_SYLLABLES;
    }
}
206+
207+
// Placeholder for the declension group of masculine/neuter nouns ending in о or е.
// The rules are not implemented yet: number, targetCase and gender are ignored and the lemma is returned unchanged.
::std::u16string inflectByRuleOE(const ::std::u16string &lemma, [[maybe_unused]] const ::std::u16string &number, [[maybe_unused]] const ::std::u16string &targetCase, [[maybe_unused]] const ::std::u16string &gender)
208+
{
209+
// TODO(nciric): implement logic.
210+
return lemma;
211+
}
212+
213+
// Placeholder for the declension group of neuter nouns ending in е.
// The rules are not implemented yet: number, targetCase and gender are ignored and the lemma is returned unchanged.
::std::u16string inflectByRuleE(const ::std::u16string &lemma, [[maybe_unused]] const ::std::u16string &number, [[maybe_unused]] const ::std::u16string &targetCase, [[maybe_unused]] const ::std::u16string &gender)
214+
{
215+
// TODO(nciric): implement logic.
216+
return lemma;
217+
}
218+
219+
::std::u16string inflectByRuleA(const ::std::u16string &lemma, const ::std::u16string &number, const ::std::u16string &targetCase)
220+
{
221+
static constexpr auto suffix_sg = ::std::to_array<::std::u16string_view>({u"а", u"е", u"и", u"у", u"а", u"ом", u"и"});
222+
static constexpr auto suffix_pl = ::std::to_array<::std::u16string_view>({u"е", u"а", u"ама", u"е", u"е", u"ама", u"ама"});
223+
224+
::std::u16string base = lemma;
225+
// Remove trailing a and apply suffix.
226+
base.pop_back();
227+
base = applySuffix(base, suffix_sg, suffix_pl, number, targetCase);
228+
229+
// Vocative singular and genitive plural require special processing in some cases.
230+
if (number == GrammemeConstants::NUMBER_SINGULAR() && targetCase == GrammemeConstants::CASE_VOCATIVE()) {
231+
Syllables syllables = countSyllables(lemma);
232+
if (lemma.ends_with(u"ица") && syllables == Syllables::MULTI_SYLLABLES) {
233+
base.back() = u'е';
234+
}
235+
if (isProperNoun(lemma) && syllables == Syllables::TWO_SYLLABLES) {
236+
base.back() = u'о';
237+
}
238+
}
239+
240+
if (number == GrammemeConstants::NUMBER_PLURAL() && targetCase == GrammemeConstants::CASE_GENITIVE()) {
241+
if (lemma.ends_with(u"тња") || lemma.ends_with(u"дња") || lemma.ends_with(u"пта") || lemma.ends_with(u"лба") || lemma.ends_with(u"рва")) {
242+
base.back() = u'и';
243+
}
244+
static const char16_t *mappings[][2] = {
245+
{u"јка", u"јака"},
246+
{u"мља", u"маља"},
247+
{u"вца", u"ваца"},
248+
{u"тка", u"така"},
249+
{u"пка", u"пака"},
250+
};
251+
for (const auto &[suffix, replacement] : mappings) {
252+
if (base.ends_with(suffix)) {
253+
auto suffix_length = std::u16string_view(suffix).length();
254+
base.replace(base.length() - suffix_length, suffix_length, replacement);
255+
}
256+
}
257+
}
258+
259+
return base;
260+
}
261+
262+
// Placeholder for the declension group of nouns ending with a consonant.
// The rules are not implemented yet: number, targetCase and gender are ignored and the lemma is returned unchanged.
::std::u16string inflectByRuleConsonant(const ::std::u16string &lemma, [[maybe_unused]] const ::std::u16string &number, [[maybe_unused]] const ::std::u16string &targetCase, [[maybe_unused]] const ::std::u16string & gender)
263+
{
264+
// TODO(nciric): implement logic.
265+
return lemma;
266+
}
267+
268+
::std::u16string applySuffix(const ::std::u16string &lemma, const ::std::array<::std::u16string_view, NUMBER_OF_CASES>& suffix_sg, const ::std::array<::std::u16string_view, NUMBER_OF_CASES>& suffix_pl,
269+
const ::std::u16string &number, const ::std::u16string &targetCase)
270+
{
271+
const ::std::map<::std::u16string, size_t> case_index = {
272+
{GrammemeConstants::CASE_NOMINATIVE(), 0},
273+
{GrammemeConstants::CASE_GENITIVE(), 1},
274+
{GrammemeConstants::CASE_DATIVE(), 2},
275+
{GrammemeConstants::CASE_ACCUSATIVE(), 3},
276+
{GrammemeConstants::CASE_VOCATIVE(), 4},
277+
{GrammemeConstants::CASE_INSTRUMENTAL(), 5},
278+
{GrammemeConstants::CASE_LOCATIVE(), 6}
279+
};
280+
281+
auto index = case_index.at(targetCase);
282+
283+
if (number == GrammemeConstants::NUMBER_SINGULAR()) {
284+
return lemma + ::std::u16string(suffix_sg[index]);
285+
} else {
286+
return lemma + ::std::u16string(suffix_pl[index]);
287+
}
288+
}
289+
290+
// Heuristic proper-noun check: returns true when the first character of lemma lies in the
// code point range U+0402..U+0428 (capital Cyrillic letters from Ђ through Ш).
// An empty lemma is never a proper noun; the guard also avoids calling front() on an empty string,
// which is undefined behaviour.
bool isProperNoun(const ::std::u16string &lemma) {
    if (lemma.empty()) {
        return false;
    }

    const char16_t first_ch = lemma.front();
    return 0x402 <= first_ch && first_ch <= 0x428;
}
299+
300+
} // namespace
301+
95302
} // namespace inflection::grammar::synthesis

inflection/src/inflection/grammar/synthesis/SrGrammarSynthesizer_SrDisplayFunction.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ class inflection::grammar::synthesis::SrGrammarSynthesizer_SrDisplayFunction
3030
SrGrammarSynthesizer_SrDisplayFunction& operator=(const SrGrammarSynthesizer_SrDisplayFunction&) = delete;
3131

3232
private:
33-
::std::u16string inflectString(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const;
33+
::std::u16string inflectFromDictionary(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const;
34+
::std::u16string inflectWithRule(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const;
3435

3536
const ::inflection::dictionary::DictionaryMetaData& dictionary;
3637
const ::inflection::dialog::SemanticFeature& caseFeature;

inflection/test/resources/inflection/dialog/inflection/sr.xml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,25 @@
1818
<!-- test><source case="vocative" number="singular" gender="masculine" pos="noun">уранак</source><result>уранче</result></test -->
1919
<!-- test><source case="vocative" number="singular" gender="masculine" pos="noun">игроказ</source><result>игрокаже</result></test -->
2020
<!-- test><source case="vocative" number="singular" gender="masculine" pos="noun">пашњак</source><result>пашњаче</result></test -->
21+
<!-- Rule based inflection, group 3, all nouns ending with a -->
22+
<test><source case="instrumental" number="singular" gender="feminine" pos="noun">Италија</source><result>Италијом</result></test>
23+
<test><source case="instrumental" number="singular" gender="feminine" pos="noun">авенија</source><result>авенијом</result></test>
24+
<test><source case="locative" number="plural" gender="feminine" pos="noun">авенија</source><result>авенијама</result></test>
25+
<test><source case="vocative" number="singular" gender="masculine" pos="noun">кадија</source><result>кадија</result></test>
26+
<test><source case="vocative" number="singular" gender="feminine" pos="noun">уметница</source><result>уметнице</result></test>
27+
<test><source case="vocative" number="singular" gender="feminine" pos="noun">птица</source><result>птица</result></test>
28+
<test><source case="vocative" number="singular" gender="feminine" pos="noun">Стана</source><result>Стано</result></test>
29+
<test><source case="vocative" number="singular" gender="feminine" pos="noun">Зора</source><result>Зоро</result></test>
30+
<test><source case="vocative" number="singular" gender="masculine" pos="noun">Божа</source><result>Божо</result></test>
31+
<test><source case="vocative" number="singular" gender="masculine" pos="noun">Љуба</source><result>Љубо</result></test>
32+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">пратња</source><result>пратњи</result></test>
33+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">радња</source><result>радњи</result></test>
34+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">лопта</source><result>лопти</result></test>
35+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">молба</source><result>молби</result></test>
36+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">конзерва</source><result>конзерви</result></test>
37+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">гошћа</source><result>гошћа</result></test>
38+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">двојка</source><result>двојака</result></test>
39+
<test><source case="genitive" number="plural" gender="feminine" pos="noun">битка</source><result>битака</result></test>
40+
<!-- There are some exceptions, like pripovetka (приповетка), where tk -> dak because of the base word. This has to be a dictionary exception. -->
41+
<!-- <test><source case="genitive" number="plural" gender="feminine" pos="noun">приповетка</source><result>приповедака</result></test> -->
2142
</inflectionTest>

0 commit comments

Comments
 (0)