1717#include < inflection/util/LocaleUtils.hpp>
1818#include < inflection/util/UnicodeSetUtils.hpp>
1919#include < inflection/npc.hpp>
20+ #include < icu4cxx/RegularExpression.hpp>
21+ #include < array>
2022#include < iterator>
2123#include < memory>
24+ #include < string>
25+ #include " SrGrammarSynthesizer_SrDisplayFunction.hpp"
2226
2327namespace inflection ::grammar::synthesis {
2428
@@ -42,7 +46,7 @@ SrGrammarSynthesizer_SrDisplayFunction::~SrGrammarSynthesizer_SrDisplayFunction(
4246{
4347}
4448
45- ::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectString (const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const
49+ ::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectFromDictionary (const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const
4650{
4751 ::std::u16string countString (GrammarSynthesizerUtil::getFeatureValue (constraints, numberFeature));
4852 ::std::u16string caseString (GrammarSynthesizerUtil::getFeatureValue (constraints, caseFeature));
@@ -61,7 +65,6 @@ ::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectString(const ::s
6165 if (!genderString.empty ()) {
6266 string_constraints.emplace_back (genderString);
6367 }
64- // The nominative/caseless is unmarked in the patterns, so we need to do something like this
6568 int64_t wordGrammemes = 0 ;
6669 dictionary.getCombinedBinaryType (&wordGrammemes, lemma);
6770
@@ -77,7 +80,65 @@ ::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectString(const ::s
7780 return inflection;
7881}
7982
80- ::inflection::dialog::DisplayValue * SrGrammarSynthesizer_SrDisplayFunction::getDisplayValue (const dialog::SemanticFeatureModel_DisplayData &displayData, const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &constraints, bool /* enableInflectionGuess */ ) const
namespace {

// Rule based inflectors for the four declension groups. Each takes the lemma plus the requested
// number, case and (where relevant) gender grammeme values and returns the inflected form.

// Masculine or neuter ending in o or e and masculine ending with consonant.
::std::u16string inflectByRuleOE(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase, const ::std::u16string& gender);
// Neuter ending in e
::std::u16string inflectByRuleE(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase, const ::std::u16string& gender);
// All genders ending in a
::std::u16string inflectByRuleA(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase);
// Feminine, ending with consonant
::std::u16string inflectByRuleConsonant(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase, const ::std::u16string& gender);

// Number of cases in Serbian. Used as the extent of the per-case suffix tables, hence size_t.
constexpr ::std::size_t NUMBER_OF_CASES = 7;

// Given the tables of all suffixes, one for singular (suffix_sg) and one for plural (suffix_pl),
// append to lemma the suffix matching the requested number and case.
::std::u16string applySuffix(const ::std::u16string& lemma,
                             const ::std::array<::std::u16string, NUMBER_OF_CASES>& suffix_sg,
                             const ::std::array<::std::u16string, NUMBER_OF_CASES>& suffix_pl,
                             const ::std::u16string& number,
                             const ::std::u16string& targetCase);
// Check if proper noun by checking the first character is capital letter.
bool isProperNoun(const ::std::u16string& lemma);

} // namespace
104+
105+ ::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectWithRule (const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const
106+ {
107+ ::std::u16string countString (GrammarSynthesizerUtil::getFeatureValue (constraints, numberFeature));
108+ ::std::u16string caseString (GrammarSynthesizerUtil::getFeatureValue (constraints, caseFeature));
109+ auto genderString = GrammarSynthesizerUtil::getFeatureValue (constraints, genderFeature);
110+
111+ ::std::u16string inflection;
112+
113+ // If one of singular/plural, case and gender are not specified return lemma.
114+ if (countString.empty () || caseString.empty () || genderString.empty ()) {
115+ return lemma;
116+ }
117+
118+ // Do nothing for singular, nominative.
119+ if (countString == GrammemeConstants::NUMBER_SINGULAR () && caseString == GrammemeConstants::CASE_NOMINATIVE ()) {
120+ return lemma;
121+ }
122+
123+ // These are four declention groups in the language.
124+ if ((lemma.ends_with (u' о' ) || lemma.ends_with (u' е' )) && (genderString == GrammemeConstants::GENDER_MASCULINE () || genderString == GrammemeConstants::GENDER_NEUTER ())) {
125+ inflection = inflectByRuleOE (lemma, countString, caseString, genderString);
126+ } else if (lemma.ends_with (u' е' ) && genderString == GrammemeConstants::GENDER_NEUTER ()) {
127+ inflection = inflectByRuleE (lemma, countString, caseString, genderString);
128+ } else if (lemma.ends_with (u' а' )) {
129+ inflection = inflectByRuleA (lemma, countString, caseString);
130+ } else {
131+ inflection = inflectByRuleConsonant (lemma, countString, caseString, genderString);
132+ }
133+
134+ if (inflection.empty ()) {
135+ inflection = lemma;
136+ }
137+
138+ return inflection;
139+ }
140+
141+ ::inflection::dialog::DisplayValue *SrGrammarSynthesizer_SrDisplayFunction::getDisplayValue (const dialog::SemanticFeatureModel_DisplayData &displayData, const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &constraints, bool /* enableInflectionGuess */ ) const
81142{
82143 ::std::u16string displayString;
83144 if (!displayData.getValues ().empty ()) {
@@ -87,9 +148,147 @@ ::inflection::dialog::DisplayValue * SrGrammarSynthesizer_SrDisplayFunction::get
87148 return nullptr ;
88149 }
89150 if (dictionary.isKnownWord (displayString)) {
90- displayString = inflectString (constraints, displayString);
151+ displayString = inflectFromDictionary (constraints, displayString);
152+ } else {
153+ // Let's use rule based inflection for nouns. Assume lemma is singular, nominative.
154+ displayString = inflectWithRule (constraints, displayString);
91155 }
92156 return new ::inflection::dialog::DisplayValue (displayString, constraints);
93157}
94158
159+ namespace {
160+
// Some rules require number of syllables in the word. It's counted as all vowels plus r if in between
// consonants, or if it starts a word followed by a consonant.
// We care about 1, 2 and more than 2 cases.
enum class Syllables {
    ONE_SYLLABLE,
    TWO_SYLLABLES,
    MULTI_SILLABLES,
};

// Classifies lemma by its syllable count.
//
// A syllable is counted for every Serbian Cyrillic vowel (either letter case) and for every "р"/"Р"
// that is followed by a non-vowel and is either the first character or preceded by a non-vowel.
// "Non-vowel" means any character that is not one of the ten vowel letters, not only consonants.
// The scan is a single pass over the string, so no regular expression has to be compiled per call,
// and two adjacent qualifying "р" are both counted.
//
// Returns ONE_SYLLABLE for at most one syllable (this includes an empty or vowel-less string, which
// has no business being reported as "more than two"), TWO_SYLLABLES for exactly two and
// MULTI_SILLABLES otherwise.
Syllables countSyllables(const ::std::u16string& lemma) {
    static constexpr ::std::u16string_view vowels = u"аеиоуАЕИОУ";
    const auto isVowel = [](char16_t ch) {
        return vowels.find(ch) != ::std::u16string_view::npos;
    };

    const ::std::size_t length = lemma.size();
    ::std::size_t total = 0;
    for (::std::size_t i = 0; i < length; ++i) {
        const char16_t ch = lemma[i];
        if (isVowel(ch)) {
            ++total;
            continue;
        }
        if (ch != u'р' && ch != u'Р') {
            continue;
        }
        // Syllabic r: nothing or a non-vowel before it, and a non-vowel right after it.
        const bool noVowelBefore = (i == 0) || !isVowel(lemma[i - 1]);
        const bool nonVowelAfter = (i + 1 < length) && !isVowel(lemma[i + 1]);
        if (noVowelBefore && nonVowelAfter) {
            ++total;
        }
    }

    if (total <= 1) {
        return Syllables::ONE_SYLLABLE;
    }
    if (total == 2) {
        return Syllables::TWO_SYLLABLES;
    }
    return Syllables::MULTI_SILLABLES;
}
193+
// Placeholder for the rule covering masculine or neuter lemmas ending in o or e.
// Currently returns lemma unchanged; number, targetCase and gender are accepted but not used yet.
// TODO(nciric): implement logic.
::std::u16string inflectByRuleOE(const ::std::u16string &lemma,
                                 [[maybe_unused]] const ::std::u16string &number,
                                 [[maybe_unused]] const ::std::u16string &targetCase,
                                 [[maybe_unused]] const ::std::u16string &gender)
{
    return lemma;
}
203+
// Placeholder for the rule covering neuter lemmas ending in e.
// Currently returns lemma unchanged; number, targetCase and gender are accepted but not used yet.
// TODO(nciric): implement logic.
::std::u16string inflectByRuleE(const ::std::u16string &lemma,
                                [[maybe_unused]] const ::std::u16string &number,
                                [[maybe_unused]] const ::std::u16string &targetCase,
                                [[maybe_unused]] const ::std::u16string &gender)
{
    return lemma;
}
213+
214+ ::std::u16string inflectByRuleA (const ::std::u16string &lemma, const ::std::u16string &number, const ::std::u16string &targetCase)
215+ {
216+ static constexpr auto suffix_sg = ::std::to_array<::std::u16string>({u" а" , u" е" , u" и" , u" у" , u" а" , u" ом" , u" и" });
217+ static constexpr auto suffix_pl = ::std::to_array<::std::u16string>({u" е" , u" а" , u" ама" , u" е" , u" е" , u" ама" , u" ама" });
218+
219+ ::std::u16string base = lemma;
220+ // Remove trailing a and apply suffix.
221+ base.pop_back ();
222+ base = applySuffix (base, suffix_sg, suffix_pl, number, targetCase);
223+
224+ // Vocative singular and genitive plural require special processing in some cases.
225+ if (number == GrammemeConstants::NUMBER_SINGULAR () && targetCase == GrammemeConstants::CASE_VOCATIVE ()) {
226+ Syllables syllables = countSyllables (lemma);
227+ if (lemma.ends_with (u" ица" ) && syllables == Syllables::MULTI_SILLABLES) {
228+ base.back () = u' е' ;
229+ }
230+ if (isProperNoun (lemma) && syllables == Syllables::TWO_SYLLABLES) {
231+ base.back () = u' о' ;
232+ }
233+ }
234+
235+ if (number == GrammemeConstants::NUMBER_PLURAL () && targetCase == GrammemeConstants::CASE_GENITIVE ()) {
236+ if (lemma.ends_with (u" тња" ) || lemma.ends_with (u" дња" ) || lemma.ends_with (u" пта" ) || lemma.ends_with (u" лба" ) || lemma.ends_with (u" рва" )) {
237+ base.back () = u' и' ;
238+ }
239+ size_t pos = 0 ;
240+ if ((pos = base.rfind (u" јк" )) != ::std::u16string::npos) base.replace (pos, 2 , u" јак" );
241+ if ((pos = base.rfind (u" мљ" )) != ::std::u16string::npos) base.replace (pos, 2 , u" маљ" );
242+ if ((pos = base.rfind (u" вц" )) != ::std::u16string::npos) base.replace (pos, 2 , u" вац" );
243+ if ((pos = base.rfind (u" тк" )) != ::std::u16string::npos) base.replace (pos, 2 , u" так" );
244+ if ((pos = base.rfind (u" пк" )) != ::std::u16string::npos) base.replace (pos, 2 , u" пак" );
245+ }
246+
247+ return base;
248+ }
249+
// Placeholder for the rule covering feminine lemmas ending with a consonant.
// Currently returns lemma unchanged; number, targetCase and gender are accepted but not used yet.
// TODO(nciric): implement logic.
::std::u16string inflectByRuleConsonant(const ::std::u16string &lemma,
                                        [[maybe_unused]] const ::std::u16string &number,
                                        [[maybe_unused]] const ::std::u16string &targetCase,
                                        [[maybe_unused]] const ::std::u16string &gender)
{
    return lemma;
}
259+
260+ ::std::u16string applySuffix (const ::std::u16string &lemma, const ::std::array<::std::u16string, NUMBER_OF_CASES>& suffix_sg, const ::std::array<::std::u16string, NUMBER_OF_CASES>& suffix_pl,
261+ const ::std::u16string &number, const ::std::u16string &targetCase)
262+ {
263+ const ::std::map<::std::u16string, size_t > case_index = {
264+ {GrammemeConstants::CASE_NOMINATIVE (), 0 },
265+ {GrammemeConstants::CASE_GENITIVE (), 1 },
266+ {GrammemeConstants::CASE_DATIVE (), 2 },
267+ {GrammemeConstants::CASE_ACCUSATIVE (), 3 },
268+ {GrammemeConstants::CASE_VOCATIVE (), 4 },
269+ {GrammemeConstants::CASE_INSTRUMENTAL (), 5 },
270+ {GrammemeConstants::CASE_LOCATIVE (), 6 }
271+ };
272+
273+ auto index = case_index.at (targetCase);
274+
275+ if (number == GrammemeConstants::NUMBER_SINGULAR ()) {
276+ return lemma + suffix_sg[index];
277+ } else {
278+ return lemma + suffix_pl[index];
279+ }
280+ }
281+
// Heuristic proper-noun check: true when the first character lies in U+0402..U+0428, the code
// point range running from Ђ to Ш that holds the Cyrillic capital letters used here.
// An empty lemma is never a proper noun (and has no first character to inspect).
bool isProperNoun(const ::std::u16string &lemma) {
    if (lemma.empty()) {
        return false;
    }

    const char16_t first_ch = lemma.front();
    return 0x402 <= first_ch && first_ch <= 0x428;
}
291+
292+ } // namespace
293+
95294} // namespace inflection::grammar::synthesis
0 commit comments