/*
 * Copyright 2025 Unicode Incorporated and others. All rights reserved.
 */
44#include < inflection/grammar/synthesis/MlGrammarSynthesizer_MlDisplayFunction.hpp>
55#include < inflection/dialog/SemanticFeature.hpp>
66#include < inflection/dialog/SemanticFeatureModel_DisplayData.hpp>
1111#include < inflection/grammar/synthesis/GrammemeConstants.hpp>
1212#include < inflection/grammar/synthesis/GrammarSynthesizerUtil.hpp>
1313#include < inflection/grammar/synthesis/MlGrammarSynthesizer.hpp>
14+ #include < inflection/tokenizer/Tokenizer.hpp>
15+ #include < inflection/tokenizer/TokenChain.hpp>
16+ #include < inflection/tokenizer/TokenizerFactory.hpp>
17+ #include < inflection/util/StringViewUtils.hpp>
18+ #include < inflection/util/Validate.hpp>
19+ #include < inflection/util/UnicodeSetUtils.hpp>
20+ #include < inflection/lang/StringFilterUtil.hpp>
21+ #include < icu4cxx/UnicodeSet.hpp>
22+ #include < unicode/uchar.h>
1423#include < inflection/npc.hpp>
1524#include < memory>
1625
@@ -19,103 +28,103 @@ namespace inflection::grammar::synthesis {
1928// Helper function to get string feature by name from the feature map
2029static std::u16string getStrFeature (
2130 const std::u16string& name,
22- const std::map<dialog::SemanticFeature, std::u16string>& features)
31+ const std::map<dialog::SemanticFeature, std::u16string>& features)
2332{
2433 for (const auto & [key, value] : features) {
25- if (key.getName () == name) {
34+ if (key.getName () == name) {
2635 return value;
2736 }
2837 }
2938 return u" " ;
3039}
3140
3241// Changed parameter type from FeatureSet to map<SemanticFeature,u16string>
33- std::u16string fallbackMalayalamPronoun (const std::map<dialog::SemanticFeature, std::u16string>& features) {
42+ std::u16string fallbackMalayalamPronoun (
43+ const std::map<dialog::SemanticFeature, std::u16string>& features)
44+ {
3445 auto person = getStrFeature (u" person" , features);
3546 auto number = getStrFeature (u" number" , features);
3647 auto case_ = getStrFeature (u" case" , features);
3748 auto gender = getStrFeature (u" gender" , features);
3849 auto clusivity = getStrFeature (u" clusivity" , features);
3950 auto formality = getStrFeature (u" formality" , features);
4051
41- // STRICT MINIMUM CHECKS: No fallback unless these are defined
4252 if (person.empty () || number.empty () || case_.empty ()) {
4353 return u" " ;
4454 }
45-
46- // Only fallback for known safe combinations (e.g., 1st person, no gender)
47- // You can later extend this to add valid known forms (e.g., ഞാൻ, നീ, etc.)
4855 if (person == u" first" && number == u" singular" && case_ == u" dative" ) {
4956 return u" എനിക്ക്" ;
5057 }
51-
52- // Optional: Add more valid fallback patterns here if you have confidence
53- // Otherwise: return empty
5458 return u" " ;
5559}
5660
5761using dialog::SemanticFeature;
5862using dialog::SemanticFeatureModel_DisplayData;
5963using dialog::DisplayValue;
6064
// Grammeme value names used by the Malayalam display function. Each constant is
// a pointer to a UTF-16 literal; they are passed to the dictionary inflector in
// the constructor and compared against std::u16string feature values.

// Case
static constexpr auto CASE_NOMINATIVE = u"nominative";
static constexpr auto CASE_ACCUSATIVE = u"accusative";
static constexpr auto CASE_DATIVE = u"dative";
static constexpr auto CASE_GENITIVE = u"genitive";
static constexpr auto CASE_INSTRUMENTAL = u"instrumental";
static constexpr auto CASE_LOCATIVE = u"locative";
static constexpr auto CASE_SOCIATIVE = u"sociative";

// Number
static constexpr auto NUMBER_SINGULAR = u"singular";
static constexpr auto NUMBER_PLURAL = u"plural";

// Gender
static constexpr auto GENDER_MASCULINE = u"masculine";
static constexpr auto GENDER_FEMININE = u"feminine";
static constexpr auto GENDER_NEUTER = u"neuter";

// Formality
static constexpr auto FORMALITY_FORMAL = u"formal";
static constexpr auto FORMALITY_INFORMAL = u"informal";

// Clusivity
static constexpr auto CLUSIVITY_INCLUSIVE = u"inclusive";
static constexpr auto CLUSIVITY_EXCLUSIVE = u"exclusive";

// Person
static constexpr auto PERSON_FIRST = u"first";
static constexpr auto PERSON_SECOND = u"second";
static constexpr auto PERSON_THIRD = u"third";

// Tense
static constexpr auto TENSE_PAST = u"past";
static constexpr auto TENSE_PRESENT = u"present";
static constexpr auto TENSE_FUTURE = u"future";

// Mood
static constexpr auto MOOD_INDICATIVE = u"indicative";
static constexpr auto MOOD_IMPERATIVE = u"imperative";
static constexpr auto MOOD_SUBJUNCTIVE = u"subjunctive";
8797
8898MlGrammarSynthesizer_MlDisplayFunction::MlGrammarSynthesizer_MlDisplayFunction (
8999 const ::inflection::dialog::SemanticFeatureModel& model)
90- : caseFeature(*npc (model.getFeature(GrammemeConstants::CASE)))
91- , numberFeature(*npc (model.getFeature(GrammemeConstants::NUMBER)))
92- , genderFeature(*npc (model.getFeature(GrammemeConstants::GENDER)))
93- , posFeature(*npc (model.getFeature(GrammemeConstants::POS)))
94- , formalityFeature(*npc (model.getFeature(u" formality" )))
95- , clusivityFeature(*npc (model.getFeature(u" clusivity" )))
96- , personFeature(*npc (model.getFeature(GrammemeConstants::PERSON)))
97- , tenseFeature(*npc (model.getFeature(u" tense" )))
98- , moodFeature(*npc (model.getFeature(u" mood" )))
99- , pronounTypeFeature(*npc (model.getFeature(u" pronounType" )))
100- , determinationFeature(*npc (model.getFeature(u" determination" )))
101- , dictionaryInflector(
102- util::LocaleUtils::MALAYALAM (),
103- {
104- {GrammemeConstants::POS_NOUN (), GrammemeConstants::POS_ADJECTIVE (), GrammemeConstants::POS_VERB ()},
105- {CASE_NOMINATIVE, CASE_ACCUSATIVE, CASE_DATIVE, CASE_GENITIVE, CASE_LOCATIVE, CASE_INSTRUMENTAL, CASE_SOCIATIVE},
106- {NUMBER_SINGULAR, NUMBER_PLURAL},
107- {GENDER_MASCULINE, GENDER_FEMININE, GENDER_NEUTER},
108- {FORMALITY_FORMAL, FORMALITY_INFORMAL},
109- {CLUSIVITY_INCLUSIVE, CLUSIVITY_EXCLUSIVE},
110- {PERSON_FIRST, PERSON_SECOND, PERSON_THIRD},
111- {TENSE_PAST, TENSE_PRESENT, TENSE_FUTURE},
112- {MOOD_INDICATIVE, MOOD_IMPERATIVE, MOOD_SUBJUNCTIVE}
113- },
114- {},
115- true )
116- {
117- // Constructor initializes feature references and dictionary inflector
118- }
100+ : caseFeature(*npc (model.getFeature(GrammemeConstants::CASE))),
101+ numberFeature(*npc (model.getFeature(GrammemeConstants::NUMBER))),
102+ genderFeature(*npc (model.getFeature(GrammemeConstants::GENDER))),
103+ posFeature(*npc (model.getFeature(GrammemeConstants::POS))),
104+ formalityFeature(*npc (model.getFeature(u" formality" ))),
105+ clusivityFeature(*npc (model.getFeature(u" clusivity" ))),
106+ personFeature(*npc (model.getFeature(GrammemeConstants::PERSON))),
107+ tenseFeature(*npc (model.getFeature(u" tense" ))),
108+ moodFeature(*npc (model.getFeature(u" mood" ))),
109+ pronounTypeFeature(*npc (model.getFeature(u" pronounType" ))),
110+ determinationFeature(*npc (model.getFeature(u" determination" ))),
111+ dictionaryInflector(
112+ util::LocaleUtils::MALAYALAM (),
113+ {
114+ {GrammemeConstants::POS_NOUN (), GrammemeConstants::POS_VERB ()},
115+ {CASE_NOMINATIVE, CASE_ACCUSATIVE, CASE_DATIVE, CASE_GENITIVE,
116+ CASE_LOCATIVE, CASE_INSTRUMENTAL, CASE_SOCIATIVE},
117+ {NUMBER_SINGULAR, NUMBER_PLURAL},
118+ {GENDER_MASCULINE, GENDER_FEMININE, GENDER_NEUTER},
119+ {FORMALITY_FORMAL, FORMALITY_INFORMAL},
120+ {CLUSIVITY_INCLUSIVE, CLUSIVITY_EXCLUSIVE},
121+ {PERSON_FIRST, PERSON_SECOND, PERSON_THIRD},
122+ {TENSE_PAST, TENSE_PRESENT, TENSE_FUTURE},
123+ {MOOD_INDICATIVE, MOOD_IMPERATIVE, MOOD_SUBJUNCTIVE}
124+ },
125+ {},
126+ true )
127+ {}
119128
120129static std::u16string guessPluralForm (const std::u16string& token) {
121130 if (token.ends_with (u" ം" )) {
@@ -127,10 +136,7 @@ static std::u16string guessPluralForm(const std::u16string& token) {
127136 if (token.ends_with (u" ൻ" )) {
128137 return token + u" മാർ" ;
129138 }
130- if (token.ends_with (u" ി" )) {
131- return token + u" കൾ" ;
132- }
133- if (token.ends_with (u" ാ" )) {
139+ if (token.ends_with (u" ി" ) || token.ends_with (u" ാ" )) {
134140 return token + u" കൾ" ;
135141 }
136142 if (!token.empty () && token.back () != u' ്' ) {
@@ -142,31 +148,20 @@ static std::u16string guessPluralForm(const std::u16string& token) {
142148::inflection::dialog::DisplayValue* MlGrammarSynthesizer_MlDisplayFunction::getDisplayValue (
143149 const SemanticFeatureModel_DisplayData& displayData,
144150 const std::map<SemanticFeature, std::u16string>& constraints,
145- bool enableInflectionGuess) const
151+ bool enableInflectionGuess) const
146152{
147153 const auto displayValue = GrammarSynthesizerUtil::getTheBestDisplayValue (displayData, constraints);
148- if (displayValue == nullptr ) {
149- if (GrammarSynthesizerUtil::getFeatureValue (constraints, posFeature) == u" pronoun" ) {
150- std::u16string fallback = fallbackMalayalamPronoun (constraints);
151- if (!fallback.empty ()) {
152- return new DisplayValue (fallback, constraints);
154+ if (displayValue == nullptr || displayValue->getDisplayString ().empty ()) {
155+ if (GrammarSynthesizerUtil::getFeatureValue (constraints, posFeature) == u" pronoun" ) {
156+ std::u16string fallback = fallbackMalayalamPronoun (constraints);
157+ if (!fallback.empty ()) {
158+ return new DisplayValue (fallback, constraints);
159+ }
153160 }
161+ return nullptr ;
154162 }
155- return nullptr ;
156- }
157163
158164 const std::u16string baseForm = displayValue->getDisplayString ();
159-
160- if (baseForm.empty ()) {
161- if (GrammarSynthesizerUtil::getFeatureValue (constraints, posFeature) == u" pronoun" ) {
162- std::u16string fallback = fallbackMalayalamPronoun (constraints);
163- if (!fallback.empty ()) {
164- return new DisplayValue (fallback, constraints);
165- }
166- }
167- return nullptr ;
168- }
169-
170165 const std::u16string posFeatureValue = GrammarSynthesizerUtil::getFeatureValue (constraints, posFeature);
171166 const std::u16string numberFeatureValue = GrammarSynthesizerUtil::getFeatureValue (constraints, numberFeature);
172167 const std::u16string caseValue = GrammarSynthesizerUtil::getFeatureValue (constraints, caseFeature);
@@ -220,7 +215,7 @@ if (displayValue == nullptr) {
220215 constraintValues.push_back (posFeatureValue);
221216 }
222217
223- if (posFeatureValue == u" adjective " || posFeatureValue == GrammemeConstants::POS_PRONOUN ()) {
218+ if (posFeatureValue == GrammemeConstants::POS_PRONOUN ()) {
224219 addIfNotEmpty (genderFeature);
225220 }
226221
@@ -239,8 +234,8 @@ if (displayValue == nullptr) {
239234 }
240235
241236 auto inflectedOpt = dictionaryInflector.inflect (baseForm, wordGrammemes, constraintValues);
242- if (inflectedOpt.has_value () && *inflectedOpt != baseForm) {
243237
238+ if (inflectedOpt.has_value () && *inflectedOpt != baseForm) {
244239 std::u16string result = *inflectedOpt;
245240
246241 if (std::find (constraintValues.begin (), constraintValues.end (), u" first" ) != constraintValues.end () &&
@@ -258,7 +253,6 @@ if (displayValue == nullptr) {
258253 (result == u" നാം" || baseForm == u" നാം" )) {
259254 return new DisplayValue (u" നമ്മൾ" , constraints);
260255 }
261-
262256 return new DisplayValue (result, constraints);
263257 }
264258
@@ -275,13 +269,18 @@ if (displayValue == nullptr) {
275269
276270 if (posFeatureValue == u" verb" ) {
277271 std::u16string stem = baseForm;
278- if (stem.size () >= 3 && stem.compare (stem.size () - 3 , 3 , u" ക്കുക" ) == 0 ) {
279- stem = stem.substr (0 , stem.size () - 3 );
280- } else if (stem.size () >= 2 && stem.compare (stem.size () - 2 , 2 , u" കുക" ) == 0 ) {
281- stem = stem.substr (0 , stem.size () - 2 );
272+ static const std::vector<std::u16string> infinitiveSuffixes = {u" ക്കുക" , u" കുക" , u" വിക്കുക" , u" പിക്കുക" };
273+
274+ for (const auto & suffix : infinitiveSuffixes) {
275+ if (stem.size () >= suffix.size () &&
276+ stem.compare (stem.size () - suffix.size (), suffix.size (), suffix) == 0 ) {
277+ stem = stem.substr (0 , stem.size () - suffix.size ());
278+ break ;
279+ }
282280 }
283281
284282 std::u16string conjugatedVerb;
283+
285284 if (moodVal == MOOD_INDICATIVE) {
286285 if (tenseVal == TENSE_PRESENT) {
287286 conjugatedVerb = stem + u" ിക്കുന്നു" ;
@@ -291,9 +290,9 @@ if (displayValue == nullptr) {
291290 conjugatedVerb = stem + u" ിക്കും" ;
292291 }
293292 } else if (moodVal == MOOD_IMPERATIVE) {
294- conjugatedVerb = stem + u" ുക " ;
293+ conjugatedVerb = baseForm ;
295294 } else if (moodVal == MOOD_SUBJUNCTIVE) {
296- conjugatedVerb = stem + u" മെന്ന് " ;
295+ conjugatedVerb = stem + u" ക്കുമെന്ന് " ;
297296 }
298297
299298 if (!conjugatedVerb.empty ()) {
@@ -303,6 +302,7 @@ if (displayValue == nullptr) {
303302
304303 if (!caseValue.empty ()) {
305304 std::u16string result;
305+
306306 if (caseValue == CASE_ACCUSATIVE) {
307307 if (baseForm.ends_with (u" ൻ" )) {
308308 result = baseForm.substr (0 , baseForm.size () - 1 ) + u" നെ" ;
@@ -329,16 +329,16 @@ if (displayValue == nullptr) {
329329 }
330330 }
331331
332- // Move this outside so it always runs last
333332 if (posFeatureValue == u" pronoun" ) {
334333 std::u16string fallback = fallbackMalayalamPronoun (constraints);
335334 if (!fallback.empty ()) {
336335 return new DisplayValue (fallback, constraints);
337336 }
338- }
337+ }
338+
339339 return nullptr ;
340340}
341341
342342MlGrammarSynthesizer_MlDisplayFunction::~MlGrammarSynthesizer_MlDisplayFunction () = default ;
343343
344- } // namespace inflection::grammar::synthesis
344+ } // namespace inflection::grammar::synthesis
0 commit comments