Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,18 @@
#include <inflection/dialog/SemanticFeatureModel.hpp>
#include <inflection/dialog/SemanticFeatureModel_DisplayData.hpp>
#include <inflection/dialog/DisplayValue.hpp>
#include <inflection/dictionary/PhraseProperties.hpp>
#include <inflection/grammar/synthesis/GrammemeConstants.hpp>
#include <inflection/grammar/synthesis/GrammarSynthesizerUtil.hpp>
#include <inflection/lang/StringFilterUtil.hpp>
#include <inflection/util/LocaleUtils.hpp>
#include <inflection/util/StringViewUtils.hpp>
#include <inflection/util/UnicodeSetUtils.hpp>
#include <inflection/npc.hpp>
#include <array>
#include <iterator>
#include <memory>
#include <string>

namespace inflection::grammar::synthesis {

Expand All @@ -42,7 +46,7 @@ SrGrammarSynthesizer_SrDisplayFunction::~SrGrammarSynthesizer_SrDisplayFunction(
{
}

::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectString(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const
::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectFromDictionary(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const
{
::std::u16string countString(GrammarSynthesizerUtil::getFeatureValue(constraints, numberFeature));
::std::u16string caseString(GrammarSynthesizerUtil::getFeatureValue(constraints, caseFeature));
Expand All @@ -61,7 +65,6 @@ ::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectString(const ::s
if (!genderString.empty()) {
string_constraints.emplace_back(genderString);
}
// The nominative/caseless is unmarked in the patterns, so we need to do something like this
int64_t wordGrammemes = 0;
dictionary.getCombinedBinaryType(&wordGrammemes, lemma);

Expand All @@ -77,7 +80,66 @@ ::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectString(const ::s
return inflection;
}

::inflection::dialog::DisplayValue * SrGrammarSynthesizer_SrDisplayFunction::getDisplayValue(const dialog::SemanticFeatureModel_DisplayData &displayData, const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &constraints, bool /* enableInflectionGuess */) const
namespace {

// Forward declarations of the file-local helpers used by inflectWithRule.

// Rule-based inflectors for the four declension groups.
// Each one receives the lemma followed by the requested number, case and (where declared) gender values.
// Masculine or neuter ending in o or e and masculine ending with consonant.
::std::u16string inflectByRuleOE(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase, const ::std::u16string& gender);
// Neuter ending in e.
::std::u16string inflectByRuleE(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase, const ::std::u16string& gender);
// All genders ending in a (this inflector takes no gender argument).
::std::u16string inflectByRuleA(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase);
// Feminine, ending with consonant.
::std::u16string inflectByRuleConsonant(const ::std::u16string& lemma, const ::std::u16string& number, const ::std::u16string& targetCase, const ::std::u16string& gender);

// Number of cases in Serbian; also the length of each suffix table handed to applySuffix.
static constexpr auto NUMBER_OF_CASES = 7UL;

// Given the tables of all suffixes, for both singular and plural, appends to the lemma the suffix matching the number and case.
// Arguments, in order: lemma, singular suffix table, plural suffix table, number, target case.
::std::u16string applySuffix(const ::std::u16string&, const ::std::array<::std::u16string_view, NUMBER_OF_CASES>&, const ::std::array<::std::u16string_view, NUMBER_OF_CASES>&, const ::std::u16string&, const ::std::u16string&);

// Checks whether the lemma is a proper noun by testing whether its first character is a capital letter.
bool isProperNoun(const ::std::u16string &lemma);

} // namespace

// Rule-based inflection of a lemma that is not handled through the dictionary.
// Reads the number, case and gender values from the constraints and dispatches to one of the
// four declension-group inflectors. The lemma is returned unchanged when any of the three values
// is missing, when singular nominative is requested, or when the chosen inflector yields an
// empty string.
::std::u16string SrGrammarSynthesizer_SrDisplayFunction::inflectWithRule(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const
{
    const ::std::u16string number(GrammarSynthesizerUtil::getFeatureValue(constraints, numberFeature));
    const ::std::u16string targetCase(GrammarSynthesizerUtil::getFeatureValue(constraints, caseFeature));
    const auto gender = GrammarSynthesizerUtil::getFeatureValue(constraints, genderFeature);

    // The rules need all three of number, case and gender; without them there is nothing to do.
    const bool fullySpecified = !number.empty() && !targetCase.empty() && !gender.empty();
    if (!fullySpecified) {
        return lemma;
    }

    // Singular nominative is the form the lemma is returned in as-is.
    const bool wantsSingular = number == GrammemeConstants::NUMBER_SINGULAR();
    if (wantsSingular && targetCase == GrammemeConstants::CASE_NOMINATIVE()) {
        return lemma;
    }

    const bool endsInO = lemma.ends_with(u'о');
    const bool endsInE = lemma.ends_with(u'е');
    const bool isNeuter = gender == GrammemeConstants::GENDER_NEUTER();
    const bool isMasculineOrNeuter = isNeuter || gender == GrammemeConstants::GENDER_MASCULINE();

    // Dispatch to one of the four declension groups of the language.
    ::std::u16string result;
    if ((endsInO || endsInE) && isMasculineOrNeuter) {
        result = inflectByRuleOE(lemma, number, targetCase, gender);
    } else if (endsInE && isNeuter) {
        // NOTE(review): this branch cannot be reached, because every neuter lemma ending in "е"
        // is already matched by the condition above. Confirm the intended split between the two groups.
        result = inflectByRuleE(lemma, number, targetCase, gender);
    } else if (lemma.ends_with(u'а')) {
        result = inflectByRuleA(lemma, number, targetCase);
    } else {
        result = inflectByRuleConsonant(lemma, number, targetCase, gender);
    }

    // Fall back to the lemma when the inflector produced nothing.
    return result.empty() ? lemma : result;
}

::inflection::dialog::DisplayValue *SrGrammarSynthesizer_SrDisplayFunction::getDisplayValue(const dialog::SemanticFeatureModel_DisplayData &displayData, const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &constraints, bool enableInflectionGuess) const
{
::std::u16string displayString;
if (!displayData.getValues().empty()) {
Expand All @@ -87,9 +149,154 @@ ::inflection::dialog::DisplayValue * SrGrammarSynthesizer_SrDisplayFunction::get
return nullptr;
}
if (dictionary.isKnownWord(displayString)) {
displayString = inflectString(constraints, displayString);
displayString = inflectFromDictionary(constraints, displayString);
} else if (enableInflectionGuess) {
// Let's use rule based inflection for nouns. Assume lemma is singular, nominative.
displayString = inflectWithRule(constraints, displayString);
}
return new ::inflection::dialog::DisplayValue(displayString, constraints);
}

namespace {

// True when ch is in StringFilterUtil::CYRILLIC_SCRIPT() and is not in PhraseProperties::DEFAULT_VOWELS_START().
static bool isConsonant(char16_t ch) {
    if (!::inflection::lang::StringFilterUtil::CYRILLIC_SCRIPT().contains(ch)) {
        return false;
    }
    return !::inflection::dictionary::PhraseProperties::DEFAULT_VOWELS_START().contains(ch);
}

// True when ch is in StringFilterUtil::CYRILLIC_SCRIPT() and also in PhraseProperties::DEFAULT_VOWELS_START().
static bool isVowel(char16_t ch) {
    if (!::inflection::lang::StringFilterUtil::CYRILLIC_SCRIPT().contains(ch)) {
        return false;
    }
    return ::inflection::dictionary::PhraseProperties::DEFAULT_VOWELS_START().contains(ch);
}

// Some rules require the number of syllables in the word. It is counted as all vowels, plus r when it sits
// between two consonants or when it starts the word and is followed by a consonant.
// Only three outcomes matter: one syllable, two syllables, and more than two.
enum class Syllables {
// Exactly one counted syllable.
ONE_SYLLABLE,
// Exactly two counted syllables.
TWO_SYLLABLES,
// Any other count; countSyllables also returns this value when the count is zero.
MULTI_SYLLABLES,
};
// Classifies lemma by its syllable count.
// Every vowel counts as one syllable. The letter "р" (either case) adds one more when it is
// followed by a consonant and is either the first character or preceded by a consonant.
// A trailing "р" never adds a syllable. Counts other than 1 and 2 map to MULTI_SYLLABLES.
Syllables countSyllables(const ::std::u16string& lemma) {
    const size_t size = lemma.length();
    uint16_t syllables = 0;

    for (size_t pos = 0; pos < size; ++pos) {
        const char16_t current = lemma[pos];
        if (isVowel(current)) {
            ++syllables;
        }

        // Only an "р" that has a following character can be syllabic.
        const bool isR = current == u'р' || current == u'Р';
        if (!isR || pos + 1 >= size) {
            continue;
        }

        // At the start of the word only the next character matters; elsewhere both neighbours do.
        const bool leftSideOk = pos == 0 || isConsonant(lemma[pos - 1]);
        if (leftSideOk && isConsonant(lemma[pos + 1])) {
            ++syllables;
        }
    }

    switch (syllables) {
        case 1:
            return Syllables::ONE_SYLLABLE;
        case 2:
            return Syllables::TWO_SYLLABLES;
        default:
            return Syllables::MULTI_SYLLABLES;
    }
}

// Placeholder for the rule-based inflector that inflectWithRule calls for masculine or neuter lemmas ending in "о" or "е".
// Not implemented yet: number, targetCase and gender are ignored and the lemma is returned unchanged.
::std::u16string inflectByRuleOE(const ::std::u16string &lemma, [[maybe_unused]] const ::std::u16string &number, [[maybe_unused]] const ::std::u16string &targetCase, [[maybe_unused]] const ::std::u16string &gender)
{
// TODO(nciric): implement logic.
return lemma;
}

// Placeholder for the rule-based inflector for neuter lemmas ending in "е".
// Not implemented yet: number, targetCase and gender are ignored and the lemma is returned unchanged.
::std::u16string inflectByRuleE(const ::std::u16string &lemma, [[maybe_unused]] const ::std::u16string &number, [[maybe_unused]] const ::std::u16string &targetCase, [[maybe_unused]] const ::std::u16string &gender)
{
// TODO(nciric): implement logic.
return lemma;
}

::std::u16string inflectByRuleA(const ::std::u16string &lemma, const ::std::u16string &number, const ::std::u16string &targetCase)
{
static constexpr auto suffix_sg = ::std::to_array<::std::u16string_view>({u"а", u"е", u"и", u"у", u"а", u"ом", u"и"});
static constexpr auto suffix_pl = ::std::to_array<::std::u16string_view>({u"е", u"а", u"ама", u"е", u"е", u"ама", u"ама"});

::std::u16string base = lemma;
// Remove trailing a and apply suffix.
base.pop_back();
base = applySuffix(base, suffix_sg, suffix_pl, number, targetCase);
Comment on lines +221 to +227
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this kind of mapping, you may be inspired by Arabic, German or Italian. They convert a string to a numeric key (makeLookupKey) containing multiple grammemes, and they map the key to a string. This mapping is initialized in the constructor instead of at runtime.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the concern the runtime size increase (static constexpr)? If yes, I can remove the static (creating these arrays is cheap).
Otherwise the current approach looks simpler. I will look into refactoring this code as I add more cases, potentially implementing Arabic like approach.

WDYT?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was to make it more scalable, but this is fine too.


// Vocative singular and genitive plural require special processing in some cases.
if (number == GrammemeConstants::NUMBER_SINGULAR() && targetCase == GrammemeConstants::CASE_VOCATIVE()) {
Syllables syllables = countSyllables(lemma);
if (lemma.ends_with(u"ица") && syllables == Syllables::MULTI_SYLLABLES) {
base.back() = u'е';
}
if (isProperNoun(lemma) && syllables == Syllables::TWO_SYLLABLES) {
base.back() = u'о';
}
}

if (number == GrammemeConstants::NUMBER_PLURAL() && targetCase == GrammemeConstants::CASE_GENITIVE()) {
if (lemma.ends_with(u"тња") || lemma.ends_with(u"дња") || lemma.ends_with(u"пта") || lemma.ends_with(u"лба") || lemma.ends_with(u"рва")) {
base.back() = u'и';
}
static const char16_t *mappings[][2] = {
{u"јка", u"јака"},
{u"мља", u"маља"},
{u"вца", u"ваца"},
{u"тка", u"така"},
{u"пка", u"пака"},
};
for (const auto &[suffix, replacement] : mappings) {
if (base.ends_with(suffix)) {
auto suffix_length = std::u16string_view(suffix).length();
base.replace(base.length() - suffix_length, suffix_length, replacement);
}
}
}

return base;
}

// Placeholder for the rule-based inflector that inflectWithRule uses as its fallback group (lemmas not matched by the other three).
// Not implemented yet: number, targetCase and gender are ignored and the lemma is returned unchanged.
::std::u16string inflectByRuleConsonant(const ::std::u16string &lemma, [[maybe_unused]] const ::std::u16string &number, [[maybe_unused]] const ::std::u16string &targetCase, [[maybe_unused]] const ::std::u16string & gender)
{
// TODO(nciric): implement logic.
return lemma;
}

::std::u16string applySuffix(const ::std::u16string &lemma, const ::std::array<::std::u16string_view, NUMBER_OF_CASES>& suffix_sg, const ::std::array<::std::u16string_view, NUMBER_OF_CASES>& suffix_pl,
const ::std::u16string &number, const ::std::u16string &targetCase)
{
const ::std::map<::std::u16string, size_t> case_index = {
{GrammemeConstants::CASE_NOMINATIVE(), 0},
{GrammemeConstants::CASE_GENITIVE(), 1},
{GrammemeConstants::CASE_DATIVE(), 2},
{GrammemeConstants::CASE_ACCUSATIVE(), 3},
{GrammemeConstants::CASE_VOCATIVE(), 4},
{GrammemeConstants::CASE_INSTRUMENTAL(), 5},
{GrammemeConstants::CASE_LOCATIVE(), 6}
};

auto index = case_index.at(targetCase);

if (number == GrammemeConstants::NUMBER_SINGULAR()) {
return lemma + ::std::u16string(suffix_sg[index]);
} else {
return lemma + ::std::u16string(suffix_pl[index]);
}
}

// Reports whether lemma looks like a proper noun, i.e. whether its first character lies in the
// range of Cyrillic capital letters U+0402 through U+0428.
// An empty lemma is never a proper noun.
bool isProperNoun(const ::std::u16string &lemma) {
    // front() on an empty string is undefined behaviour, so handle that case explicitly.
    if (lemma.empty()) {
        return false;
    }

    constexpr char16_t FIRST_CAPITAL = 0x402;
    constexpr char16_t LAST_CAPITAL = 0x428;

    const char16_t first_ch = lemma.front();
    return FIRST_CAPITAL <= first_ch && first_ch <= LAST_CAPITAL;
}

} // namespace

} // namespace inflection::grammar::synthesis
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ class inflection::grammar::synthesis::SrGrammarSynthesizer_SrDisplayFunction
SrGrammarSynthesizer_SrDisplayFunction& operator=(const SrGrammarSynthesizer_SrDisplayFunction&) = delete;

private:
::std::u16string inflectString(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const;
::std::u16string inflectFromDictionary(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const;
::std::u16string inflectWithRule(const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& constraints, const ::std::u16string& lemma) const;

const ::inflection::dictionary::DictionaryMetaData& dictionary;
const ::inflection::dialog::SemanticFeature& caseFeature;
Expand Down
21 changes: 21 additions & 0 deletions inflection/test/resources/inflection/dialog/inflection/sr.xml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,25 @@
<!-- test><source case="vocative" number="singular" gender="masculine" pos="noun">уранак</source><result>уранче</result></test -->
<!-- test><source case="vocative" number="singular" gender="masculine" pos="noun">игроказ</source><result>игрокаже</result></test -->
<!-- test><source case="vocative" number="singular" gender="masculine" pos="noun">пашњак</source><result>пашњаче</result></test -->
<!-- Rule based inflection, group 3, all nouns ending with a -->
<test><source case="instrumental" number="singular" gender="feminine" pos="noun">Италија</source><result>Италијом</result></test>
<test><source case="instrumental" number="singular" gender="feminine" pos="noun">авенија</source><result>авенијом</result></test>
<test><source case="locative" number="plural" gender="feminine" pos="noun">авенија</source><result>авенијама</result></test>
<test><source case="vocative" number="singular" gender="masculine" pos="noun">кадија</source><result>кадија</result></test>
<test><source case="vocative" number="singular" gender="feminine" pos="noun">уметница</source><result>уметнице</result></test>
<test><source case="vocative" number="singular" gender="feminine" pos="noun">птица</source><result>птица</result></test>
<test><source case="vocative" number="singular" gender="feminine" pos="noun">Стана</source><result>Стано</result></test>
<test><source case="vocative" number="singular" gender="feminine" pos="noun">Зора</source><result>Зоро</result></test>
<test><source case="vocative" number="singular" gender="masculine" pos="noun">Божа</source><result>Божо</result></test>
<test><source case="vocative" number="singular" gender="masculine" pos="noun">Љуба</source><result>Љубо</result></test>
<test><source case="genitive" number="plural" gender="feminine" pos="noun">пратња</source><result>пратњи</result></test>
<test><source case="genitive" number="plural" gender="feminine" pos="noun">радња</source><result>радњи</result></test>
<test><source case="genitive" number="plural" gender="feminine" pos="noun">лопта</source><result>лопти</result></test>
<test><source case="genitive" number="plural" gender="feminine" pos="noun">молба</source><result>молби</result></test>
<test><source case="genitive" number="plural" gender="feminine" pos="noun">конзерва</source><result>конзерви</result></test>
<test><source case="genitive" number="plural" gender="feminine" pos="noun">гошћа</source><result>гошћа</result></test>
<test><source case="genitive" number="plural" gender="feminine" pos="noun">двојка</source><result>двојака</result></test>
<test><source case="genitive" number="plural" gender="feminine" pos="noun">битка</source><result>битака</result></test>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You have a lot of fully fleshed out constraints. Most of the other languages only change specific grammemes. Sometimes you only specify the case, number or gender. The other tests usually specify less. The other languages usually default to noun. These tests are currently fine, but common usage starts from any surface form (ideally a unique surface form), and then you modify just the relevant grammemes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can probably remove noun info, but I do need a case, gender and number for rule based approach to work.
I also assume nominative input (lemma) - otherwise the rules would be more complex, or would need dictionary support to implement them.

<!-- There are some exceptions, like pripovetka, where tk -> dak because of the base word. These have to be handled as dictionary exceptions. -->
<!-- <test><source case="genitive" number="plural" gender="feminine" pos="noun">приповетка</source><result>приповедака</result></test> -->
</inflectionTest>
Loading