Skip to content

Commit d11e89e

Browse files
committed
Changed files based on comments
1 parent 5b80c77 commit d11e89e

File tree

15 files changed

+540
-446
lines changed

15 files changed

+540
-446
lines changed

documents/how_to_add_new_language.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ The following steps with help you identify files that need to be added or change
1010
NOTE: Take a look at [PR #40](https://github.com/unicode-org/inflection/pull/40) and [PR #111](https://github.com/unicode-org/inflection/pull/111) for examples of how to add initial language support based on dictionary lookup only.
1111
In general, to bootstrap your progress, look for a grammatically similar language that's already supported, e.g. if you are adding Serbian, look at the existing Russian implementation.
1212
This will help you find most of the files you need to add/change and will speed up implementation of the rules and lexicons.
13-
We recommend you spend around a week researching the language and all the different components of the language before even beginning to modify and add the files below. Look at all the files in the project such as tokenizers, configuration files, grammar files, and different lookup functions to see what you need. This will save you a lot of time in the end. We highly suggest you stray away from hardcoded logic and rely on the Dictionary Lookup. Look at all the grammemes, tokenizer logic, multi-word phrase handling
13+
We recommend you spend around a week researching the language and all the different components of the language before even beginning to modify and add the files below. Look at all the files in the project such as tokenizers, configuration files, grammar files, and different lookup functions to see what you need. This will save you a lot of time in the end. We strongly suggest you stay away from hardcoded logic and rely on the Dictionary Lookup instead. Look at all the grammemes, tokenizer logic, and multi-word phrase handling.
1414

1515
Before you add new language support, go to the README.md in the inflection subfolder (inflection/inflection/README.md), build the project, and make sure all the tests run on your computer.
1616

inflection/resources/org/unicode/inflection/dictionary/.gitattributes

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ dictionary_he.lst filter=lfs diff=lfs merge=lfs -text
88
dictionary_hi.lst filter=lfs diff=lfs merge=lfs -text
99
dictionary_it.lst filter=lfs diff=lfs merge=lfs -text
1010
dictionary_ko.lst filter=lfs diff=lfs merge=lfs -text
11+
dictionary_ml.lst filter=lfs diff=lfs merge=lfs -text
1112
dictionary_nb.lst filter=lfs diff=lfs merge=lfs -text
1213
dictionary_nl.lst filter=lfs diff=lfs merge=lfs -text
1314
dictionary_pt.lst filter=lfs diff=lfs merge=lfs -text
@@ -23,6 +24,7 @@ inflectional_fr.xml filter=lfs diff=lfs merge=lfs -text
2324
inflectional_he.xml filter=lfs diff=lfs merge=lfs -text
2425
inflectional_hi.xml filter=lfs diff=lfs merge=lfs -text
2526
inflectional_it.xml filter=lfs diff=lfs merge=lfs -text
27+
inflectional_ml.xml filter=lfs diff=lfs merge=lfs -text
2628
inflectional_nb.xml filter=lfs diff=lfs merge=lfs -text
2729
inflectional_nl.xml filter=lfs diff=lfs merge=lfs -text
2830
inflectional_pt.xml filter=lfs diff=lfs merge=lfs -text

inflection/resources/org/unicode/inflection/tokenizer/config_ml.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@
33
#
44
tokenizer.implementation.class=DefaultTokenizer
55
tokenizer.nonDecompound.file=/org/unicode/inflection/tokenizer/ml/nondecompound.tok
6-
tokenizer.decompound=^(ശ്രീ)(.+?)(ഗുരു|സര്‍ക്കാര്‍)$|^(.+?)(ഗുരു|സര്‍ക്കാര്‍)$|^(.+?)(ഉണ്ട്|ആണ്|ഇല്ല)$|^(.+?)(ഒടൊപ്പം|ഉടൻ|ഓടെ|ഓട്|ഒപ്പം|തന്നെ|പോലും|പോലെ|ഉം|യ്)$|^(.+?)(കളുടെ|ങ്ങളുടെ|ത്തിന്റെ|ൻ്റെ|ന്റെ|യുടേ|യുടെ|യാൽ|യിൽ|ഇൽ|ല്|ൽ|ക്ക്|മാർ|ങ്ങൾ|കൾ|നെ|യെ)$
6+
# Every alternative carries its own ^ and $: alternation (|) binds looser than the anchors,
# so an anchor written only at the very start or very end applies to a single branch.
tokenizer.decompound=^(ശ്രീ)(.+?)(ഗുരു|സര്‍ക്കാര്‍)$|^(.+?)(ഗുരു|സര്‍ക്കാര്‍)$|^(.+?)(ഉണ്ട്|ആണ്|ഇല്ല)$|^(.+?)(ഒടൊപ്പം|ഉടൻ|ഓടെ|ഓട്|ഒപ്പം|തന്നെ|പോലും|പോലെ|ഉം|യ്)$|^(.+?)(കളുടെ|ങ്ങളുടെ|ത്തിന്റെ|ൻ്റെ|ന്റെ|യുടേ|യുടെ|യാൽ|യിൽ|ഇൽ|ല്|ൽ|ക്ക്|മാർ|ങ്ങൾ|കൾ|നെ|യെ)$
77

inflection/src/inflection/dialog/language/MlCommonConceptFactory.cpp

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,6 @@
1010

1111
namespace inflection::dialog::language {
1212

13-
MlCommonConceptFactory::MlCommonConceptFactory(const ::inflection::util::ULocale& language)
14-
: super(language)
15-
{
16-
}
17-
18-
MlCommonConceptFactory::~MlCommonConceptFactory()
19-
{
20-
}
21-
2213
// Malayalam-specific conjunction for OR
2314
::inflection::dialog::SemanticConceptList* MlCommonConceptFactory::createOrList(
2415
const ::std::vector<const ::inflection::dialog::SemanticFeatureConceptBase*>& concepts) const
Lines changed: 139 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,157 @@
11
/*
2-
* Copyright 2025 Apple Inc. All rights reserved.
2+
* Copyright 2025 Unicode Incorporated and others. All rights reserved.
33
*/
4-
#include <inflection/grammar/synthesis/MlGrammarSynthesizer.hpp>
54

5+
#include <inflection/grammar/synthesis/MlGrammarSynthesizer.hpp>
66
#include <inflection/dialog/SemanticFeatureModel.hpp>
77
#include <inflection/grammar/synthesis/MlGrammarSynthesizer_NumberLookupFunction.hpp>
88
#include <inflection/grammar/synthesis/MlGrammarSynthesizer_GenderLookupFunction.hpp>
99
#include <inflection/grammar/synthesis/MlGrammarSynthesizer_CaseLookupFunction.hpp>
1010
#include <inflection/grammar/synthesis/MlGrammarSynthesizer_MlDisplayFunction.hpp>
1111
#include <inflection/grammar/synthesis/GrammemeConstants.hpp>
12+
#include <inflection/npc.hpp>
13+
#include <map>
1214

1315
namespace inflection::grammar::synthesis {
1416

17+
// Grammeme string that getMood() maps to Mood::subjunctive.
static constexpr auto MOOD_SUBJUNCTIVE = u"subjunctive";
18+
1519
// Installs the Malayalam defaults on the feature model: one lookup function each for
// number, gender and case, followed by the Malayalam display function.
// NOTE(review): the raw pointers are handed straight to the model; presumably it takes
// ownership of them -- confirm against SemanticFeatureModel.
void MlGrammarSynthesizer::addSemanticFeatures(::inflection::dialog::SemanticFeatureModel& featureModel)
{
    auto* const numberLookup = new MlGrammarSynthesizer_NumberLookupFunction();
    featureModel.putDefaultFeatureFunctionByName(GrammemeConstants::NUMBER, numberLookup);

    auto* const genderLookup = new MlGrammarSynthesizer_GenderLookupFunction();
    featureModel.putDefaultFeatureFunctionByName(GrammemeConstants::GENDER, genderLookup);

    auto* const caseLookup = new MlGrammarSynthesizer_CaseLookupFunction();
    featureModel.putDefaultFeatureFunctionByName(GrammemeConstants::CASE, caseLookup);

    auto* const displayFunction = new MlGrammarSynthesizer_MlDisplayFunction(featureModel);
    featureModel.setDefaultDisplayFunction(displayFunction);
}
2330

24-
} // namespace inflection::grammar::synthesis
31+
// Translates a number grammeme string into the Number enum.
// A null pointer or an unrecognised string yields Number::undefined.
MlGrammarSynthesizer::Number MlGrammarSynthesizer::getNumber(const ::std::u16string* value) {
    auto result = Number::undefined;
    if (value != nullptr) {
        const ::std::u16string& grammeme = *value;
        if (grammeme == GrammemeConstants::NUMBER_SINGULAR()) {
            result = Number::singular;
        } else if (grammeme == GrammemeConstants::NUMBER_PLURAL()) {
            result = Number::plural;
        }
    }
    return result;
}
37+
38+
// Translates a case grammeme string into the Case enum.
// A null pointer or an unrecognised string yields Case::undefined.
MlGrammarSynthesizer::Case MlGrammarSynthesizer::getCase(const ::std::u16string* value) {
    if (value == nullptr) {
        return Case::undefined;
    }

    // Compares the incoming grammeme with one candidate constant.
    const auto matches = [value](const ::std::u16string& candidate) {
        return *value == candidate;
    };

    if (matches(GrammemeConstants::CASE_NOMINATIVE())) {
        return Case::nominative;
    }
    if (matches(GrammemeConstants::CASE_ACCUSATIVE())) {
        return Case::accusative;
    }
    if (matches(GrammemeConstants::CASE_DATIVE())) {
        return Case::dative;
    }
    if (matches(GrammemeConstants::CASE_GENITIVE())) {
        return Case::genitive;
    }
    if (matches(GrammemeConstants::CASE_INSTRUMENTAL())) {
        return Case::instrumental;
    }
    if (matches(GrammemeConstants::CASE_LOCATIVE())) {
        return Case::locative;
    }
    return Case::undefined;
}
48+
49+
// Translates a person grammeme string into the Person enum.
// A null pointer or an unrecognised string yields Person::undefined.
MlGrammarSynthesizer::Person MlGrammarSynthesizer::getPerson(const ::std::u16string* value) {
    if (value == nullptr) {
        return Person::undefined;
    }
    const ::std::u16string& grammeme = *value;
    return grammeme == GrammemeConstants::PERSON_FIRST()  ? Person::first
         : grammeme == GrammemeConstants::PERSON_SECOND() ? Person::second
         : grammeme == GrammemeConstants::PERSON_THIRD()  ? Person::third
                                                          : Person::undefined;
}
2556

57+
// Translates a tense grammeme string into the Tense enum.
// A null pointer or an unrecognised string yields Tense::undefined.
MlGrammarSynthesizer::Tense MlGrammarSynthesizer::getTense(const ::std::u16string* value) {
    Tense result = Tense::undefined;
    if (value == nullptr) {
        return result;
    }

    const ::std::u16string& grammeme = *value;
    if (grammeme == GrammemeConstants::TENSE_PAST()) {
        result = Tense::past;
    } else if (grammeme == GrammemeConstants::TENSE_PRESENT()) {
        result = Tense::present;
    } else if (grammeme == GrammemeConstants::TENSE_FUTURE()) {
        result = Tense::future;
    }
    return result;
}
64+
65+
// Translates a mood grammeme string into the Mood enum.
// Indicative and imperative are matched against GrammemeConstants; subjunctive is
// matched against the file-local MOOD_SUBJUNCTIVE literal.
// A null pointer or an unrecognised string yields Mood::undefined.
MlGrammarSynthesizer::Mood MlGrammarSynthesizer::getMood(const ::std::u16string* value) {
    if (value == nullptr) {
        return Mood::undefined;
    }
    const ::std::u16string& grammeme = *value;

    if (grammeme == GrammemeConstants::MOOD_INDICATIVE()) {
        return Mood::indicative;
    }
    if (grammeme == GrammemeConstants::MOOD_IMPERATIVE()) {
        return Mood::imperative;
    }
    return grammeme == MOOD_SUBJUNCTIVE ? Mood::subjunctive : Mood::undefined;
}
72+
73+
// Packs a noun number/case pair into one key:
// bits 0-7 hold the case, bits 8-15 hold the number.
MlGrammarSynthesizer::LookupKey MlGrammarSynthesizer::makeLookupKey(Number num, Case kase) {
    const LookupKey caseBits = static_cast<LookupKey>(kase) & 0xFF;
    const LookupKey numberBits = (static_cast<LookupKey>(num) & 0xFF) << 8;
    return caseBits | numberBits;
}
77+
78+
// Packs the four verb features into one key:
// bits 0-7 person, bits 8-15 number, bits 24-27 tense, bits 28-31 mood.
// Bits 16-23 are left at zero.
MlGrammarSynthesizer::LookupKey MlGrammarSynthesizer::makeVerbLookupKey(Person person, Number num, Tense tense, Mood mood) {
    const LookupKey personBits = static_cast<LookupKey>(person) & 0xFF;
    const LookupKey numberBits = (static_cast<LookupKey>(num) & 0xFF) << 8;
    const LookupKey tenseBits = (static_cast<LookupKey>(tense) & 0x0F) << 24;
    const LookupKey moodBits = (static_cast<LookupKey>(mood) & 0x0F) << 28;
    return personBits | numberBits | tenseBits | moodBits;
}
84+
85+
// Reference-taking convenience wrapper around getPerson().
MlGrammarSynthesizer::Person MlGrammarSynthesizer::personFromConstraint(const ::std::u16string& val) {
    const ::std::u16string* const constraint = &val;
    return getPerson(constraint);
}
88+
89+
// Reference-taking convenience wrapper around getNumber().
MlGrammarSynthesizer::Number MlGrammarSynthesizer::numberFromConstraint(const ::std::u16string& val) {
    const ::std::u16string* const constraint = &val;
    return getNumber(constraint);
}
92+
93+
// Reference-taking convenience wrapper around getCase().
MlGrammarSynthesizer::Case MlGrammarSynthesizer::caseFromConstraint(const ::std::u16string& val) {
    const ::std::u16string* const constraint = &val;
    return getCase(constraint);
}
96+
97+
// Derives the verb suffix key from a list of constraint values.
// Each value is tried against every feature that is still undefined, so the first
// value recognised for a feature wins and later ones are ignored for that feature.
// Features that no value matches stay undefined in the resulting key.
MlGrammarSynthesizer::LookupKey MlGrammarSynthesizer::buildVerbSuffixKey(const std::vector<::std::u16string>& constraintValues) {
    auto person = Person::undefined;
    auto number = Number::undefined;
    auto tense = Tense::undefined;
    auto mood = Mood::undefined;

    for (auto it = constraintValues.begin(); it != constraintValues.end(); ++it) {
        const ::std::u16string* const candidate = &*it;
        if (person == Person::undefined) {
            person = getPerson(candidate);
        }
        if (number == Number::undefined) {
            number = getNumber(candidate);
        }
        if (tense == Tense::undefined) {
            tense = getTense(candidate);
        }
        if (mood == Mood::undefined) {
            mood = getMood(candidate);
        }
    }

    return makeVerbLookupKey(person, number, tense, mood);
}
112+
113+
const std::map<MlGrammarSynthesizer::LookupKey, ::std::u16string> MlGrammarSynthesizer::malayalamSuffixMap = {
114+
{makeLookupKey(Number::singular, Case::nominative), u""},
115+
{makeLookupKey(Number::plural, Case::nominative), u"കൾ"},
116+
{makeLookupKey(Number::singular, Case::genitive), u"യുടെ"},
117+
{makeLookupKey(Number::plural, Case::genitive), u"കളുടെ"},
118+
{makeLookupKey(Number::singular, Case::dative), u"ക്ക്"},
119+
{makeLookupKey(Number::plural, Case::dative), u"കൾക്ക്"},
120+
};
121+
122+
const std::map<MlGrammarSynthesizer::LookupKey, ::std::u16string> MlGrammarSynthesizer::malayalamVerbSuffixMap = {
123+
{makeVerbLookupKey(Person::first, Number::singular, Tense::past, Mood::indicative), u"ച്ചു"},
124+
{makeVerbLookupKey(Person::first, Number::plural, Tense::past, Mood::indicative), u"ഞ്ഞു"},
125+
{makeVerbLookupKey(Person::second, Number::singular, Tense::past, Mood::indicative), u"ച്ചു"},
126+
{makeVerbLookupKey(Person::second, Number::plural, Tense::past, Mood::indicative), u"ന്നു"},
127+
{makeVerbLookupKey(Person::third, Number::singular, Tense::past, Mood::indicative), u"ച്ചു"},
128+
{makeVerbLookupKey(Person::third, Number::plural, Tense::past, Mood::indicative), u"ന്നു"},
129+
130+
{makeVerbLookupKey(Person::first, Number::singular, Tense::present, Mood::indicative), u"ിക്കുന്നു"},
131+
{makeVerbLookupKey(Person::first, Number::plural, Tense::present, Mood::indicative), u"ിക്കുന്നു"},
132+
{makeVerbLookupKey(Person::second, Number::singular, Tense::present, Mood::indicative), u"ിക്കുന്നു"},
133+
{makeVerbLookupKey(Person::second, Number::plural, Tense::present, Mood::indicative), u"ിക്കുന്നു"},
134+
{makeVerbLookupKey(Person::third, Number::singular, Tense::present, Mood::indicative), u"ിക്കുന്നു"},
135+
{makeVerbLookupKey(Person::third, Number::plural, Tense::present, Mood::indicative), u"ിക്കുന്നു"},
136+
137+
{makeVerbLookupKey(Person::first, Number::singular, Tense::future, Mood::indicative), u" ചെയ്യും"},
138+
{makeVerbLookupKey(Person::first, Number::plural, Tense::future, Mood::indicative), u" ചെയ്യും"},
139+
{makeVerbLookupKey(Person::second, Number::singular, Tense::future, Mood::indicative), u" ചെയ്യും"},
140+
{makeVerbLookupKey(Person::second, Number::plural, Tense::future, Mood::indicative), u" ചെയ്യും"},
141+
{makeVerbLookupKey(Person::third, Number::singular, Tense::future, Mood::indicative), u" ചെയ്യും"},
142+
{makeVerbLookupKey(Person::third, Number::plural, Tense::future, Mood::indicative), u" ചെയ്യും"},
143+
};
144+
145+
// Looks up the noun suffix stored for the key in malayalamSuffixMap.
// Returns a reference to a shared empty string when the key has no entry.
const std::u16string& MlGrammarSynthesizer::getSuffix(LookupKey key) {
    static const std::u16string noSuffix{};

    const auto entry = malayalamSuffixMap.find(key);
    if (entry == malayalamSuffixMap.end()) {
        return noSuffix;
    }
    return entry->second;
}
150+
151+
// Looks up the verb suffix stored for the key in malayalamVerbSuffixMap.
// Returns a reference to a shared empty string when the key has no entry.
const std::u16string& MlGrammarSynthesizer::getVerbSuffix(LookupKey key) {
    static const std::u16string noSuffix{};

    const auto entry = malayalamVerbSuffixMap.find(key);
    if (entry == malayalamVerbSuffixMap.end()) {
        return noSuffix;
    }
    return entry->second;
}
156+
157+
} // namespace inflection::grammar::synthesis
Lines changed: 73 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,86 @@
11
/*
2-
* Copyright 2025 Apple Inc. All rights reserved.
3-
*/
2+
* Copyright 2025 Unicode Incorporated and others. All rights reserved.
3+
*/
44
#pragma once
55

66
#include <inflection/dialog/fwd.hpp>
77
#include <inflection/grammar/synthesis/fwd.hpp>
8+
#include <cstdint>
89
#include <string>
10+
#include <vector>
11+
#include <map>
912

1013
class inflection::grammar::synthesis::MlGrammarSynthesizer final
1114
{
1215
public:
13-
static void addSemanticFeatures(::inflection::dialog::SemanticFeatureModel& featureModel);
16+
static void addSemanticFeatures(::inflection::dialog::SemanticFeatureModel& model);
17+
18+
enum class Number {
19+
undefined,
20+
singular,
21+
plural
22+
};
23+
static Number getNumber(const ::std::u16string* value);
24+
25+
enum class Case {
26+
undefined,
27+
nominative,
28+
accusative,
29+
dative,
30+
genitive,
31+
instrumental,
32+
locative
33+
};
34+
static Case getCase(const ::std::u16string* value);
35+
36+
enum class Person {
37+
undefined,
38+
first,
39+
second,
40+
third
41+
};
42+
static Person getPerson(const ::std::u16string* value);
43+
44+
enum class Tense {
45+
undefined,
46+
past,
47+
present,
48+
future
49+
};
50+
static Tense getTense(const ::std::u16string* value);
51+
52+
enum class Mood {
53+
undefined,
54+
indicative,
55+
imperative,
56+
subjunctive
57+
};
58+
static Mood getMood(const ::std::u16string* value);
59+
60+
typedef uint32_t LookupKey;
61+
static LookupKey makeLookupKey(Number num, Case kase);
62+
static LookupKey makeVerbLookupKey(Person person, Number num, Tense tense, Mood mood);
63+
64+
static Person personFromConstraint(const ::std::u16string& val);
65+
static Number numberFromConstraint(const ::std::u16string& val);
66+
static Case caseFromConstraint(const ::std::u16string& val);
67+
68+
static LookupKey buildVerbSuffixKey(const std::vector<::std::u16string>& constraintValues);
69+
70+
static const std::map<LookupKey, ::std::u16string> malayalamSuffixMap;
71+
static const std::map<LookupKey, ::std::u16string> malayalamVerbSuffixMap;
72+
73+
static const ::std::u16string& getSuffix(LookupKey key);
74+
static const ::std::u16string& getVerbSuffix(LookupKey key);
75+
1476
private:
1577
MlGrammarSynthesizer() = delete;
16-
};
1778

79+
public:
80+
static constexpr auto NOUN_CASE = u"case";
81+
static constexpr auto NOUN_NUMBER = u"number";
82+
static constexpr auto VERB_PERSON = u"person";
83+
static constexpr auto VERB_NUMBER = u"verbNumber";
84+
static constexpr auto VERB_TENSE = u"tense";
85+
static constexpr auto VERB_MOOD = u"mood";
86+
};

0 commit comments

Comments
 (0)