diff --git a/inflection/resources/org/unicode/inflection/dictionary/.gitattributes b/inflection/resources/org/unicode/inflection/dictionary/.gitattributes index 7e2fe358..bff5d4bc 100644 --- a/inflection/resources/org/unicode/inflection/dictionary/.gitattributes +++ b/inflection/resources/org/unicode/inflection/dictionary/.gitattributes @@ -1,2 +1,4 @@ dictionary_da.lst filter=lfs diff=lfs merge=lfs -text +dictionary_en.lst filter=lfs diff=lfs merge=lfs -text inflectional_da.xml filter=lfs diff=lfs merge=lfs -text +inflectional_en.xml filter=lfs diff=lfs merge=lfs -text diff --git a/inflection/resources/org/unicode/inflection/dictionary/dictionary_en.lst b/inflection/resources/org/unicode/inflection/dictionary/dictionary_en.lst index 9d149220..f761f0f9 100644 --- a/inflection/resources/org/unicode/inflection/dictionary/dictionary_en.lst +++ b/inflection/resources/org/unicode/inflection/dictionary/dictionary_en.lst @@ -1,100 +1,3 @@ -Joses: plural proper-noun inflection=2 -Paris: singular plural proper-noun inflection=2 inflection=b -United States: singular proper-noun inflection=8 inflection=2e5 -a: determiner inflection=26 -an: determiner inflection=26 -animal: singular adjective noun inflection=1 -apple: singular adjective noun inflection=1 -apples: plural noun inflection=1 -are: singular plural first second third present noun verb inflection=1b0 inflection=1 -around: adposition -bean: singular plural first second third present infinitive noun verb inflection=1 inflection=3 -beans: singular plural third present noun verb inflection=1 inflection=3 -between: adposition -boy: singular interjection noun inflection=1 -boys: plural noun inflection=1 -cat: singular plural first second third present infinitive noun verb inflection=1 inflection=1c9 -cats: singular plural third present noun verb inflection=1 inflection=1c9 -church: singular plural first second third present infinitive noun verb inflection=7 inflection=9 -churches: singular plural third present noun verb 
inflection=7 inflection=9 -cities: plural noun inflection=6 -city: singular adjective noun inflection=6 -create: singular plural first second third present infinitive verb inflection=4 -creates: singular third present verb inflection=4 -day: singular noun inflection=1 -days: plural noun inflection=1 -fan: singular plural first second third present infinitive noun verb inflection=1 inflection=1d -fans: singular plural third present noun verb inflection=1 inflection=1d -flock: singular plural first second third present infinitive noun verb inflection=1 inflection=3 -friend: singular plural first second third present infinitive noun verb inflection=1 inflection=3 -friends: singular plural third present noun verb inflection=1 inflection=3 -garbage: singular noun inflection=1 -garbages: plural noun inflection=1 -garden: singular plural first second third present infinitive noun verb inflection=1 inflection=3 -geese: plural noun inflection=f4 -glutei maximi: plural noun inflection=17f -glutei: plural noun inflection=15 -gluteus maximus: singular noun inflection=17f -gluteus: singular noun inflection=15 -good: singular adjective noun inflection=1 -goods: plural noun inflection=1 -goose: singular plural first second third present infinitive noun verb inflection=4 inflection=f4 -has: singular third present verb inflection=fa -have: singular plural first second third present infinitive noun verb inflection=1 inflection=fa -head: singular plural first second third present infinitive adjective noun verb inflection=1 inflection=3 -heads: singular plural third present noun verb inflection=1 inflection=3 -hour: singular vowel-start noun inflection=1 -hours: plural vowel-start noun inflection=1 -houses: singular plural third present noun verb inflection=1 inflection=4 -is: singular third present verb inflection=1b0 -it: singular inanimate accusative nominative third pronoun -kidney: singular noun inflection=1 -leading: singular gerund adjective noun verb inflection=1 inflection=83 
-light: singular plural first second third present infinitive adjective noun verb inflection=1 inflection=9e -lights: singular plural third present noun verb inflection=1 inflection=9e -man: singular plural first second third present infinitive interjection noun verb inflection=c inflection=1d -men: plural noun inflection=c -mice: plural noun inflection=63 -mouse: singular plural first second third present infinitive noun verb inflection=4 inflection=63 -noun: singular noun inflection=1 -nouns: plural noun inflection=1 -of: adposition -on: adverb adposition -orange: singular adjective noun inflection=1 -patio: singular noun inflection=1 -patios: plural noun inflection=1 -phrase: singular plural first second third present infinitive noun verb inflection=1 inflection=4 -phrases: singular plural third present noun verb inflection=1 inflection=4 -pie: singular noun inflection=1 -pies: plural noun inflection=1 -plural: singular adjective noun inflection=1 -plurals: plural noun inflection=1 -red: singular adjective noun inflection=1 -sag: singular plural first second third present infinitive noun verb inflection=1 inflection=14 -sags: singular plural third present noun verb inflection=1 inflection=14 -sheep: singular plural noun inflection=a -sister: singular noun inflection=1 inflection=33b -sisters: plural noun inflection=1 inflection=33b -spatula: singular noun inflection=1 -test: singular plural first second third present infinitive noun verb inflection=1 inflection=3 -tests: singular plural third present noun verb inflection=1 inflection=3 -that: singular inanimate demonstrative determiner pronoun inflection=18b -the: determiner inflection=26 -theories: plural noun inflection=6 -theory: singular noun inflection=6 -these: plural inanimate demonstrative determiner pronoun inflection=15d -thesis: singular noun inflection=10 -this: singular inanimate demonstrative determiner pronoun inflection=15d -those: plural inanimate demonstrative determiner pronoun inflection=18b 
-to: adposition -truss: singular plural first second third present infinitive noun verb inflection=7 inflection=9 -trusses: singular plural third present noun verb inflection=7 inflection=9 -umbrella: singular noun inflection=1 -unicorn: singular consonant-start noun inflection=1 -word: singular plural first second third present infinitive noun verb inflection=1 inflection=3 -work: singular plural first second third present infinitive noun verb inflection=1 inflection=3 -works: singular plural third present noun verb inflection=1 inflection=3 -yen: singular plural noun inflection=a -============================================== -Manually curated for tests to pass -Copyright 2024-2024 Apple Inc. All rights reserved. +version https://git-lfs.github.com/spec/v1 +oid sha256:7b78d5a694d0d6e301a530687fb4bd017563a3a30e2f60c1decd4cc8e06c413f +size 4946201 diff --git a/inflection/resources/org/unicode/inflection/dictionary/inflectional_en.xml b/inflection/resources/org/unicode/inflection/dictionary/inflectional_en.xml index ea52e2df..e5e439e8 100644 --- a/inflection/resources/org/unicode/inflection/dictionary/inflectional_en.xml +++ b/inflection/resources/org/unicode/inflection/dictionary/inflectional_en.xml @@ -1,324 +1,3 @@ - - - - - noun - - - - s - - - - proper-noun - - - - s - - - - verb - - - - - s - - - - ed - ed - ed - - ing - - - - verb - e - - e - e - es - e - e - e - ed - ed - ed - e - ing - - - - noun - y - - y - ies - - - - noun - - - - es - - - - proper-noun - - - - - - - verb - - - - - es - - - - ed - ed - ed - - ing - - - - noun - - - - - - - - proper-noun - - - - es - - - - noun - an - - an - en - - - - noun - is - - is - es - - - - verb - - - - - s - - - - ged - ged - ged - - ging - - - - noun - us - - us - i - - - - verb - - - - - s - - - - ned - ned - ned - - ning - - - - determiner - - - noun - ouse - - ouse - ice - - - - verb - ad - - ad - ad - ads - ad - ad - ad - d - d - d - ad - ading - - - - verb - ght - - ght - ght - ghts - ght - ght - ght - t - 
t - t - ght - ghting - - - - noun - oose - - oose - eese - - - - verb - ve - - ve - ve - s - ve - ve - ve - d - d - d - ve - ving - st - th - st - st - st - - - - determiner - is - - is - ese - - - - noun - us maximus - - us maximus - i maximi - - - - determiner - at - - at - ose - - - - verb - be - - am - are - is - are - are - are - was - wast - were - been - be - being - wert - - - - verb - - - - - s - - - - ted - ted - ted - - tin' - ting - - - - proper-noun - of America - - - of America - of Americas - - - - noun - er - - a - er - ers - - - +version https://git-lfs.github.com/spec/v1 +oid sha256:e7aab928cb118a6581ed6e0908a00d782ad565cfb539734baa9e37c0e22a797f +size 516358 diff --git a/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.cpp b/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.cpp index f421b4c6..982f875d 100644 --- a/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.cpp +++ b/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.cpp @@ -1,4 +1,5 @@ /* + * Copyright 2025 Unicode Incorporated and others. All rights reserved. * Copyright 2017-2024 Apple Inc. All rights reserved. 
*/ #include @@ -86,7 +87,8 @@ ::inflection::dialog::DisplayValue* EnGrammarSynthesizer_EnDisplayFunction::getD } auto displayValueConstraints(constraints); ::std::u16string countString = GrammarSynthesizerUtil::getFeatureValue(constraints, countFeature); - if (countString == GrammemeConstants::NUMBER_PLURAL() || countString == GrammemeConstants::NUMBER_SINGULAR()) { + bool isRequestingPlural = countString == GrammemeConstants::NUMBER_PLURAL(); + if (isRequestingPlural || countString == GrammemeConstants::NUMBER_SINGULAR()) { auto result = inflectPhrase(displayString, constraints, enableInflectionGuess); if (!result && !enableInflectionGuess) { return nullptr; @@ -102,7 +104,7 @@ ::inflection::dialog::DisplayValue* EnGrammarSynthesizer_EnDisplayFunction::getD ::std::u16string caseString = GrammarSynthesizerUtil::getFeatureValue(constraints, caseFeature); if (caseString == GrammemeConstants::CASE_GENITIVE()) { - displayString = inflectPossessive(displayString, displayValueConstraints); + displayString = inflectPossessive(displayString, displayValueConstraints, isRequestingPlural); } return definitenessDisplayFunction.addDefiniteness(new ::inflection::dialog::DisplayValue(displayString, displayValueConstraints), constraints); @@ -200,7 +202,7 @@ ::std::u16string EnGrammarSynthesizer_EnDisplayFunction::guessSingularInflection return displayString; } -::std::u16string EnGrammarSynthesizer_EnDisplayFunction::inflectPossessive(const ::std::u16string& displayString, ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& valueConstraints) const +::std::u16string EnGrammarSynthesizer_EnDisplayFunction::inflectPossessive(const ::std::u16string& displayString, ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& valueConstraints, bool isRequestingPlural) const { ::std::u16string lowercase; ::inflection::util::StringViewUtils::lowercase(&lowercase, displayString, ::inflection::util::LocaleUtils::ENGLISH()); @@ -218,7 +220,7 @@ ::std::u16string 
EnGrammarSynthesizer_EnDisplayFunction::inflectPossessive(const else { if (::inflection::util::StringViewUtils::endsWith(suffix, u"s")) { ::std::unique_ptr<::inflection::tokenizer::TokenChain> tokenChain(npc(npc(tokenizer.get())->createTokenChain(displayString))); - if (dictionary.hasAllProperties(npc(npc(tokenChain->getTail())->getPrevious())->getValue(), pluralProperty)) { + if (isRequestingPlural || dictionary.hasAllProperties(npc(npc(tokenChain->getTail())->getPrevious())->getValue(), pluralProperty)) { suffixStr = u"’"; } } diff --git a/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.hpp b/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.hpp index 771d99ac..f70a0af2 100644 --- a/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.hpp +++ b/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.hpp @@ -1,4 +1,5 @@ /* + * Copyright 2025 Unicode Incorporated and others. All rights reserved. * Copyright 2017-2024 Apple Inc. All rights reserved. 
*/ #pragma once @@ -42,7 +43,7 @@ class inflection::grammar::synthesis::EnGrammarSynthesizer_EnDisplayFunction private: ::std::u16string guessPluralInflection(const ::std::u16string& displayString) const; ::std::u16string guessSingularInflection(const ::std::u16string& displayString) const; - ::std::u16string inflectPossessive(const std::u16string &displayString, std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &valueConstraints) const; + ::std::u16string inflectPossessive(const std::u16string &displayString, std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &valueConstraints, bool isRequestingPlural) const; ::std::optional<::std::u16string> inflectPhrase(const std::u16string &originalString, const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &constraints, bool enableInflectionGuess) const; public: diff --git a/inflection/test/src/inflection/dictionary/DictionaryMetaDataTest.cpp b/inflection/test/src/inflection/dictionary/DictionaryMetaDataTest.cpp index a5f4270a..bf53c759 100644 --- a/inflection/test/src/inflection/dictionary/DictionaryMetaDataTest.cpp +++ b/inflection/test/src/inflection/dictionary/DictionaryMetaDataTest.cpp @@ -1,4 +1,5 @@ /* + * Copyright 2025 Unicode Incorporated and others. All rights reserved. * Copyright 2016-2024 Apple Inc. All rights reserved. 
*/ #include "catch2/catch_test_macros.hpp" @@ -33,7 +34,6 @@ TEST_CASE("DictionaryMetaDataTest#testEnglish") REQUIRE_FALSE(npc(dictionary)->getPropertyValues(u"man", u"inflection").empty()); REQUIRE_FALSE(npc(dictionary)->getPropertyValues(u"theories", u"inflection").empty()); REQUIRE_FALSE(npc(dictionary)->getPropertyValues(u"theory", u"inflection").empty()); - REQUIRE_FALSE(npc(dictionary)->getPropertyValues(u"United States", u"inflection").empty()); REQUIRE(npc(dictionary)->hasProperty(u"Paris", u"proper-noun")); REQUIRE(npc(dictionary)->hasProperty(u"paris", u"proper-noun")); int64_t properties = 0; diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/DocumentState.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/DocumentState.java new file mode 100644 index 00000000..615d156f --- /dev/null +++ b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/DocumentState.java @@ -0,0 +1,143 @@ +/* + * Copyright 2025 Unicode Incorporated and others. All rights reserved. + * Copyright 2020-2024 Apple Inc. All rights reserved. + */ +package org.unicode.wikidata; + +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.nio.charset.StandardCharsets; +import java.text.NumberFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.TreeMap; +import java.util.regex.Pattern; + +/** + * Contains statistical information on what has been analyzed. 
+ */
+final class DocumentState {
+    // Counters reported in the statistics footer written by printDocument(). Within this class
+    // only mergedCount is modified (see addDictionaryEntry); the others are package-visible,
+    // presumably so the parser can update them -- TODO confirm against callers.
+    int lemmaCount = 0;
+    int unusableLemmaCount = 0;
+    int unusableSurfaceFormCount = 0;
+    int mergedCount = 0;
+    int incomingSurfaceForm = 0;
+    // NOTE(review): the generic type arguments below were reconstructed from how the
+    // collections are used in this class -- confirm against the real file.
+    /** Dictionary entries keyed by phrase; the TreeMap keeps them in key order for output. */
+    TreeMap<String, DictionaryEntry> dictionary = new TreeMap<>();
+    /** Inflection patterns collected so far; they are renumbered when the document is printed. */
+    ArrayList<InflectionPattern> inflectionPatterns = new ArrayList<>(1024);
+
+    /**
+     * @return true when there is more than one inflection pattern, or exactly one pattern whose
+     *         count is greater than one.
+     */
+    boolean isInflectional() {
+        if (inflectionPatterns.size() > 1) {
+            return true;
+        }
+        return inflectionPatterns.size() == 1 && inflectionPatterns.get(0).getCount() > 1;
+    }
+
+    /**
+     * Sorts the patterns by descending count and then renumbers them 1..n in that order.
+     *
+     * @param inflectionPatterns the list to sort in place; every element gets a new identifier.
+     */
+    private void sortInflectionPatterns(ArrayList<InflectionPattern> inflectionPatterns) {
+        // Most common patterns first; ties are broken by the existing identifier
+        // for lack of a better tie-breaker.
+        inflectionPatterns.sort(Comparator
+                .comparing(InflectionPattern::getCount)
+                .reversed()
+                .thenComparing(InflectionPattern::getID));
+        int identifierEnumeration = 1;
+        for (InflectionPattern inflectionPattern : inflectionPatterns) {
+            // This is where the identifiers are reassigned to their new values.
+            inflectionPattern.setID(identifierEnumeration++);
+        }
+    }
+
+    /**
+     * Adds an entry to the dictionary. If an entry with the same phrase already exists, the new
+     * entry is merged into the existing one and mergedCount is incremented.
+     *
+     * @param dictionaryEntry the entry to add or merge.
+     */
+    public void addDictionaryEntry(DictionaryEntry dictionaryEntry) {
+        String phrase = dictionaryEntry.phrase;
+        DictionaryEntry existingDictionaryEntry = dictionary.get(phrase);
+        if (existingDictionaryEntry == null) {
+            dictionary.put(phrase, dictionaryEntry);
+        } else {
+            mergedCount++;
+            existingDictionaryEntry.merge(dictionaryEntry);
+        }
+    }
+
+    /**
+     * Formats numerator/denominator as a parenthesized percentage.
+     * A zero denominator is reported as 0% instead of NaN.
+     */
+    private static String percent(NumberFormat percentFormat, long numerator, long denominator) {
+        double ratio = denominator == 0 ? 0.0 : numerator / (double) denominator;
+        return '(' + percentFormat.format(ratio) + ')';
+    }
+
+    /** Sorts and renumbers the inflection patterns, then writes them to the inflectional file. */
+    private void printInflectionPatterns(ParserOptions parserOptions) throws FileNotFoundException {
+        try (PrintWriter inflectionalStream = new PrintWriter(new OutputStreamWriter(
+                new FileOutputStream(parserOptions.inflectionalFilename), StandardCharsets.UTF_8))) {
+            // NOTE(review): the header/footer literals below look like they lost their XML
+            // markup in this view of the patch -- confirm against the real file.
+            inflectionalStream.println("\n" +
+                    "");
+
+            sortInflectionPatterns(inflectionPatterns);
+            for (InflectionPattern inflectionPattern : inflectionPatterns) {
+                inflectionalStream.print(inflectionPattern);
+            }
+            inflectionalStream.println("");
+        }
+    }
+
+    /**
+     * Writes the collected data to disk: the inflection patterns (only when isInflectional()),
+     * then the lexical dictionary followed by a statistics footer.
+     *
+     * @param parserOptions supplies the output filenames, the source filenames and the
+     *                      invocation options echoed in the footer.
+     * @param startTime     start of processing, in the System.currentTimeMillis() time base;
+     *                      used to report the elapsed time.
+     * @throws FileNotFoundException if an output file cannot be opened for writing.
+     */
+    void printDocument(ParserOptions parserOptions, long startTime) throws FileNotFoundException {
+        // Evaluate once instead of once per entry: sorting/renumbering the patterns does not
+        // change their number or their counts, so the answer stays the same.
+        boolean inflectional = isInflectional();
+        if (inflectional) {
+            printInflectionPatterns(parserOptions);
+        }
+
+        TreeMap<Enum<?>, Integer> grammemeCounts = new TreeMap<>(EnumComparator.ENUM_COMPARATOR);
+        int unclassifiedTerms = 0;
+        try (PrintWriter lexicalDictionaryStream = new PrintWriter(new OutputStreamWriter(
+                new FileOutputStream(parserOptions.lexicalDictionaryFilename), StandardCharsets.UTF_8))) {
+            for (DictionaryEntry dictionaryEntry : dictionary.values()) {
+                if (dictionaryEntry.getGrammemes().isEmpty()) {
+                    // We don't care about only known words. We need grammeme data
+                    unclassifiedTerms++;
+                    continue;
+                }
+                // Print the dictionary entry to the .lst file.
+                lexicalDictionaryStream.println(dictionaryEntry.toString(inflectional));
+                for (Enum<?> grammeme : dictionaryEntry.getGrammemes()) {
+                    grammemeCounts.merge(grammeme, 1, Integer::sum);
+                }
+            }
+
+            NumberFormat percentFormat = NumberFormat.getPercentInstance(Locale.US);
+            percentFormat.setMaximumFractionDigits(1);
+            int dictionarySize = dictionary.size();
+            // Only the file names are reported, not their directories.
+            StringBuilder source = new StringBuilder();
+            Pattern anythingSlash = Pattern.compile(".*/");
+            for (String sourceFilename : parserOptions.sourceFilenames) {
+                source.append(anythingSlash.matcher(sourceFilename).replaceAll("")).append(" ");
+            }
+            lexicalDictionaryStream.println("==============================================");
+            lexicalDictionaryStream.printf("%30s %7s%n", "Source:", source);
+            lexicalDictionaryStream.printf("%30s %7d%n", "Lemma terms:", lemmaCount);
+            lexicalDictionaryStream.printf("%30s %7d%n", "Unusable lemma terms:", unusableLemmaCount);
+            lexicalDictionaryStream.printf("%30s %7d%n", "Incoming surface forms:", incomingSurfaceForm);
+            lexicalDictionaryStream.printf("%30s %7d%n", "Surface forms:", dictionarySize);
+            lexicalDictionaryStream.printf("%30s %7d %7s%n", "Collapsed surface forms:", mergedCount,
+                    percent(percentFormat, mergedCount, incomingSurfaceForm));
+            lexicalDictionaryStream.printf("%30s %7d%n", "Unusable surface forms:", unusableSurfaceFormCount);
+            lexicalDictionaryStream.printf("%30s %7d %7s%n", "Usable terms:", dictionarySize - unclassifiedTerms,
+                    percent(percentFormat, dictionarySize - unclassifiedTerms, dictionarySize));
+            lexicalDictionaryStream.printf("%30s %7d %7s%n", "Unclassified terms:", unclassifiedTerms,
+                    percent(percentFormat, unclassifiedTerms, dictionarySize));
+            lexicalDictionaryStream.println("==============================================");
+
+            // Group the counted grammemes by the simple name of their enum class.
+            TreeMap<String, List<Enum<?>>> categories = new TreeMap<>();
+            for (Enum<?> grammeme : grammemeCounts.keySet()) {
+                categories.computeIfAbsent(grammeme.getClass().getSimpleName(), key -> new ArrayList<>())
+                        .add(grammeme);
+            }
+
+            for (var categoryEntry : categories.entrySet()) {
+                var categoryName = categoryEntry.getKey();
+                lexicalDictionaryStream.printf("%s:%n", categoryName);
+                var categoryValues = categoryEntry.getValue();
+                // Ascending sort followed by a reverse: most frequent grammeme first.
+                // Kept as two steps so that the order of equal counts is unchanged.
+                categoryValues.sort(Comparator.comparing(grammemeCounts::get));
+                Collections.reverse(categoryValues);
+                for (var categoryValue : categoryValues) {
+                    lexicalDictionaryStream.printf(" %-20s %7d %7s%n", categoryValue.toString() + ':',
+                            grammemeCounts.get(categoryValue),
+                            percent(percentFormat, grammemeCounts.get(categoryValue), dictionarySize));
+                }
+                lexicalDictionaryStream.printf("%n");
+            }
+            long endTime = System.currentTimeMillis();
+            long elapsedTime = (endTime - startTime);
+            // Zero-pad the milliseconds: 1005 ms must print as "1.005", not "1.5".
+            lexicalDictionaryStream.println("processed in "
+                    + String.format(Locale.ROOT, "%d.%03d", elapsedTime / 1000, elapsedTime % 1000)
+                    + " seconds");
+            lexicalDictionaryStream.println("License: Creative Commons CC0 License (https://creativecommons.org/publicdomain/zero/1.0/)");
+            lexicalDictionaryStream.println("generated with options: " + String.join(" ", parserOptions.optionsUsedToInvoke));
+        }
+    }
+
+    DocumentState() {
+    }
+}
diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/Grammar.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/Grammar.java index 631ee973..4e335023 100644 --- a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/Grammar.java +++ b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/Grammar.java @@ -233,8 +233,7 @@ public String toString() { } enum FormType { - SHORT_FORM, - IRREGULAR; + SHORT_FORM; private final String printableValue; FormType() { @@ -491,6 +490,12 @@ enum Sound { RIEUL_END, VOWEL_START, VOWEL_END, + BACK_ROUND, + BACK_UNROUND, + FRONT_ROUND, + FRONT_UNROUND, + HARD_CONSONANT, + SOFT_CONSONANT, ; private final String printableValue; @@ -509,6 +514,7 @@ enum Register { FAMILIAR, FORMAL, HIGH, + PEJORATIVE, // pejorative (or derogatory) A word form expressing a negative or belittling attitude towards the person or thing referred to INFORMAL, INTIMATE, LITERARY; @@ -538,8 +544,17 @@ public String toString() { } } + static final Map> DEFAULTMAP = new HashMap<>(1021); + + static { + for (var enumClass : Grammar.class.getDeclaredClasses()) { + for (var enumValue : enumClass.getEnumConstants()) { + DEFAULTMAP.put(enumValue.toString(), (Enum)enumValue); + } + } + } + static final Map>> TYPEMAP = new HashMap<>(1021); - static final Map REMAP = new HashMap<>(1021); static { @@ -553,6 +568,7 @@ public String toString() { TYPEMAP.put("Q918270", EnumSet.of(PartOfSpeech.ABBREVIATION)); // initalism TYPEMAP.put("Q30619513", EnumSet.of(PartOfSpeech.ABBREVIATION)); // USPS abbreviation TYPEMAP.put("Q126473", EnumSet.of(PartOfSpeech.ABBREVIATION)); //
contraction + TYPEMAP.put("Q1130279", EnumSet.of(PartOfSpeech.ABBREVIATION)); // hypocorism, short nickname TYPEMAP.put("Q34698", EnumSet.of(PartOfSpeech.ADJECTIVE)); TYPEMAP.put("Q12259986", EnumSet.of(PartOfSpeech.ADJECTIVE)); // prenominal adjective TYPEMAP.put("Q7233569", EnumSet.of(PartOfSpeech.ADJECTIVE)); // postpositive adjective @@ -560,6 +576,7 @@ public String toString() { TYPEMAP.put("Q1091269", EnumSet.of(PartOfSpeech.ADJECTIVE)); // na-adjective in Japanese TYPEMAP.put("Q7250170", EnumSet.of(PartOfSpeech.ADJECTIVE)); // proper adjective, the adjective form of a proper noun TYPEMAP.put("Q332375", EnumSet.of(PartOfSpeech.ADJECTIVE)); // absolute adjective (uncomparable adjective) + TYPEMAP.put("Q3618903", EnumSet.of(PartOfSpeech.ADJECTIVE)); // indefinite adjective TYPEMAP.put("Q380057", EnumSet.of(PartOfSpeech.ADVERB)); TYPEMAP.put("Q1668170", EnumSet.of(PartOfSpeech.INTERROGATIVE, PartOfSpeech.ADVERB)); TYPEMAP.put("Q1522423", EnumSet.of(PartOfSpeech.ADVERB)); // locative adverb, but we don't need that much precision about the type. @@ -573,6 +590,7 @@ public String toString() { TYPEMAP.put("Q28833099", EnumSet.of(PartOfSpeech.CONJUNCTION)); // coordinating conjunction TYPEMAP.put("Q576271", EnumSet.of(PartOfSpeech.DETERMINER)); TYPEMAP.put("Q5051", new HashSet<>(Arrays.asList(Case.GENITIVE, PartOfSpeech.DETERMINER))); // possessive determiner + TYPEMAP.put("Q2824480", EnumSet.of(PartOfSpeech.DETERMINER)); // demonstrative adjective, but it's really a determiner. 
TYPEMAP.put("Q83034", EnumSet.of(PartOfSpeech.INTERJECTION)); TYPEMAP.put("Q2304610", EnumSet.of(PartOfSpeech.INTERROGATIVE)); TYPEMAP.put("Q12021746", EnumSet.of(PartOfSpeech.INTERROGATIVE)); @@ -598,9 +616,11 @@ public String toString() { TYPEMAP.put("Q115762248", EnumSet.of(PartOfSpeech.PARTICLE)); // vocative particle TYPEMAP.put("Q113076880", EnumSet.of(PartOfSpeech.ADVERB)); // postpositive adverb TYPEMAP.put("Q65807752", EnumSet.of(PartOfSpeech.ADVERB)); // demonstrative adverb + TYPEMAP.put("Q134316", EnumSet.of(PartOfSpeech.ADPOSITION)); // adposition TYPEMAP.put("Q161873", EnumSet.of(PartOfSpeech.ADPOSITION)); // postposition TYPEMAP.put("Q4833830", EnumSet.of(PartOfSpeech.ADPOSITION)); // preposition TYPEMAP.put("Q36224", EnumSet.of(PartOfSpeech.PRONOUN)); + TYPEMAP.put("Q2006180", EnumSet.of(PartOfSpeech.PRONOUN)); // pro-form, word that substitutes for another word, broader scope than pronoun TYPEMAP.put("Q147276", EnumSet.of(PartOfSpeech.PROPER_NOUN)); // proper noun TYPEMAP.put("Q7884789", EnumSet.of(PartOfSpeech.PROPER_NOUN)); // toponym TYPEMAP.put("Q43229", EnumSet.of(PartOfSpeech.PROPER_NOUN)); // organization @@ -611,10 +631,12 @@ public String toString() { // TYPEMAP.put("Q1350145", EnumSet.of(PartOfSpeech.VERB, PartOfSpeech.NOUN)); // verbal noun, like boxing TYPEMAP.put("Q11399805", EnumSet.of(PartOfSpeech.VERB)); // auxiliary verb TYPEMAP.put("Q131431824", EnumSet.of(PartOfSpeech.VERB)); // proper verb where you use a proper noun as a verb + TYPEMAP.put("Q3254028", EnumSet.of(PartOfSpeech.VERB)); // separable verb, verb with a prefix which separates from the core verb in certain positions in a sentence TYPEMAP.put("Q4239848", new HashSet<>(Arrays.asList(FormType.SHORT_FORM, PartOfSpeech.ADJECTIVE))); // short form of an adjective - TYPEMAP.put("short-form", EnumSet.of(FormType.SHORT_FORM)); - TYPEMAP.put("irregular", EnumSet.of(FormType.IRREGULAR)); + TYPEMAP.put("Q112154", EnumSet.of(FormType.SHORT_FORM)); // apocope, loss of word-final 
sounds + TYPEMAP.put("Q650250", EnumSet.of(FormType.SHORT_FORM)); // elision, omission of one or more sounds in a word + TYPEMAP.put("Q114092330", EnumSet.of(FormType.SHORT_FORM)); // prevocalic form, linguistic feature marking a linguistic unit as appearing only before vowels TYPEMAP.put("Q109267112", EnumSet.of(Polarity.AFFIRMATIVE)); TYPEMAP.put("Q1478451", EnumSet.of(Polarity.NEGATIVE)); @@ -647,13 +669,17 @@ public String toString() { TYPEMAP.put("Q53998049", EnumSet.of(Count.UNCOUNTABLE)); // indefinite number, neither singular nor plural, uncountable. Unmarked appears in declension when it is not necessary to specify singular or plural, such as because it is a proper name or is next to a determiner or a quantifier. TYPEMAP.put("stressed", EnumSet.of(Emphasis.STRESSED)); + TYPEMAP.put("Q55464002", EnumSet.of(Emphasis.STRESSED)); // strong form TYPEMAP.put("unstressed", EnumSet.of(Emphasis.UNSTRESSED)); + TYPEMAP.put("Q55464014", EnumSet.of(Emphasis.UNSTRESSED)); // weak form TYPEMAP.put("Q499327", EnumSet.of(Gender.MASCULINE)); TYPEMAP.put("Q54020116", new HashSet<>(Arrays.asList(Gender.MASCULINE, Animacy.ANIMATE))); TYPEMAP.put("Q52943434", new HashSet<>(Arrays.asList(Gender.MASCULINE, Animacy.INANIMATE))); TYPEMAP.put("Q27918551", new HashSet<>(Arrays.asList(Gender.MASCULINE, Animacy.HUMAN))); // masculine personal TYPEMAP.put("Q52943193", new HashSet<>(Arrays.asList(Gender.MASCULINE, Animacy.ANIMATE))); // masculine animate non-personal + TYPEMAP.put("Q18478758", new HashSet<>(Arrays.asList(Gender.MASCULINE, Gender.FEMININE))); // common of two genders + TYPEMAP.put("Q100919075", new HashSet<>(Arrays.asList(Gender.MASCULINE, Gender.FEMININE))); // ambiguous gender TYPEMAP.put("Q1775415", EnumSet.of(Gender.FEMININE)); TYPEMAP.put("Q1775461", EnumSet.of(Gender.NEUTER)); TYPEMAP.put("Q1305037", EnumSet.of(Gender.COMMON)); @@ -713,12 +739,12 @@ public String toString() { TYPEMAP.put("Q956030", new HashSet<>(Arrays.asList(Definiteness.INDEFINITE, 
PartOfSpeech.PRONOUN))); // TYPEMAP.put(asTreeSet("Q53998049"), EnumSet.of(Definiteness.INDEFINITE)); // indefinite number TYPEMAP.put("Q10265745", new HashSet<>(Arrays.asList(Definiteness.DEMONSTRATIVE, PartOfSpeech.DETERMINER))); // demonstrative determiner + TYPEMAP.put("Q79377486", new HashSet<>(Arrays.asList(Definiteness.DEMONSTRATIVE, PartOfSpeech.DETERMINER))); // distal, demonstrative TYPEMAP.put("Q10345583", new HashSet<>(Arrays.asList(Tense.PRESENT, VerbType.PARTICIPLE))); TYPEMAP.put("Q1230649", new HashSet<>(Arrays.asList(Tense.PAST, VerbType.PARTICIPLE))); TYPEMAP.put("Q72249355", new HashSet<>(Arrays.asList(Voice.ACTIVE, VerbType.PARTICIPLE))); TYPEMAP.put("Q72249544", new HashSet<>(Arrays.asList(Voice.PASSIVE, VerbType.PARTICIPLE))); - TYPEMAP.put("Q112785242", new HashSet<>(Arrays.asList(Aspect.IMPERFECT, VerbType.PARTICIPLE))); // imperfect participle TYPEMAP.put("Q113133303", EnumSet.of(VerbType.PARTICIPLE)); // conjunctive participle TYPEMAP.put("Q192613", EnumSet.of(Tense.PRESENT)); // present tense TYPEMAP.put("Q3910936", new HashSet<>(Arrays.asList(Aspect.SIMPLE, Tense.PRESENT))); // simple present and usually future @@ -769,6 +795,7 @@ public String toString() { TYPEMAP.put("Q953129", EnumSet.of(PartOfSpeech.PRONOUN)); // reflexive pronoun TYPEMAP.put("Q130266209", EnumSet.of(PartOfSpeech.PRONOUN)); // reflexive personal pronoun TYPEMAP.put("Q1050744", EnumSet.of(PartOfSpeech.PRONOUN)); // relative pronoun + TYPEMAP.put("Q1462657", EnumSet.of(PartOfSpeech.PRONOUN)); // reciprocal pronoun TYPEMAP.put("Q625581", EnumSet.of(Mood.CONDITIONAL)); TYPEMAP.put("Q3686414", new HashSet<>(Arrays.asList(Tense.PRESENT, Mood.CONDITIONAL))); // conditional present @@ -808,7 +835,7 @@ public String toString() { TYPEMAP.put("Q108524486", EnumSet.of(Aspect.IMPERFECT)); TYPEMAP.put("Q7240943", new HashSet<>(Arrays.asList(Tense.PRESENT, Aspect.IMPERFECT))); // present continuous/present imperfect TYPEMAP.put("Q56650537", new HashSet<>(Arrays.asList(Tense.PAST, 
Aspect.IMPERFECT))); // past continuous/present imperfect - TYPEMAP.put("Q56650537", new HashSet<>(Arrays.asList(Aspect.IMPERFECT, VerbType.PARTICIPLE))); // imperfect participle + TYPEMAP.put("Q112785242", new HashSet<>(Arrays.asList(Aspect.IMPERFECT, VerbType.PARTICIPLE))); // imperfect participle TYPEMAP.put("Q113115936", new HashSet<>(Arrays.asList(Aspect.PERFECT, VerbType.PARTICIPLE))); // perfect participle TYPEMAP.put("Q623742", EnumSet.of(Aspect.PLUPERFECT)); @@ -823,6 +850,9 @@ public String toString() { TYPEMAP.put("Q56650485", new HashSet<>(Arrays.asList(Person.SECOND, Register.INFORMAL))); TYPEMAP.put("Q66664394", EnumSet.of(Register.INTIMATE)); // endearing TYPEMAP.put("high", EnumSet.of(Register.HIGH)); + TYPEMAP.put("Q545779", EnumSet.of(Register.PEJORATIVE)); // pejorative + TYPEMAP.put("Q54948374", EnumSet.of(Register.PEJORATIVE)); // depreciative form + TYPEMAP.put("Q1521634", EnumSet.of(Register.PEJORATIVE)); // vulgarism TYPEMAP.put("Q75242466", EnumSet.of(Register.CONVERSATIONAL)); // chalita bhasha TYPEMAP.put("Q55228835", EnumSet.of(Register.CONVERSATIONAL)); // colloquial form TYPEMAP.put("Q20613396", EnumSet.of(Register.LITERARY)); // historical language style that was used in 19th and 20th century Bangla literary works @@ -834,6 +864,7 @@ public String toString() { TYPEMAP.put("Q1358239", EnumSet.of(Sizeness.AUGMENTATIVE)); TYPEMAP.put("Q221446", EnumSet.of(Sizeness.AUGMENTATIVE)); // reduplication in Japanese + TYPEMAP.put("Q6029894", EnumSet.of(Sizeness.AUGMENTATIVE)); // intensive TYPEMAP.put("Q108709", EnumSet.of(Sizeness.DIMINUTIVE)); TYPEMAP.put("consonant-end", EnumSet.of(Sound.CONSONANT_END)); @@ -841,9 +872,6 @@ public String toString() { TYPEMAP.put("rieul-end", EnumSet.of(Sound.RIEUL_END)); TYPEMAP.put("vowel-end", EnumSet.of(Sound.VOWEL_END)); TYPEMAP.put("vowel-start", EnumSet.of(Sound.VOWEL_START)); -// TYPEMAP.put("Q650250", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // elision, omission of one or more sounds in a word or 
phrase -// TYPEMAP.put("Q114092330", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // prevocalic form, linguistic feature marking a linguistic unit as appearing only before vowels -// TYPEMAP.put("Q112154", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // apocope, loss of word-final sounds TYPEMAP.put("Q101252532", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // where consonant is unmutated TYPEMAP.put("Q56648699", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // soft mutation, where consonant becomes more sonorous TYPEMAP.put("Q117262361", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // pausal form, form of a word realised in hiatus between prosodic units @@ -856,6 +884,7 @@ public String toString() { TYPEMAP.put("standard", EnumSet.of(Usage.STANDARD)); TYPEMAP.put("Q55094451", EnumSet.of(Usage.RARE)); // rare form + TYPEMAP.put("Q58157328", EnumSet.of(Usage.RARE)); // rare, indicates whether lexeme sense is used rarely TYPEMAP.put("Q8102", EnumSet.of(Usage.RARE)); // slang TYPEMAP.put("Q12237354", EnumSet.of(Usage.RARE)); // obsolete word TYPEMAP.put("Q54943392", EnumSet.of(Usage.RARE)); // obsolete form @@ -883,6 +912,7 @@ public String toString() { TYPEMAP.put("Q56042915", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // prepositional phrase TYPEMAP.put("Q1778442", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // verb phrase TYPEMAP.put("Q384876", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // set phrase + TYPEMAP.put("Q3062294", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // Latin phrase TYPEMAP.put("Q1527589", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // phrasal verb TYPEMAP.put("Q117606981", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // verbo-nominal syntagma TYPEMAP.put("Q12734432", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // attributive locution, phrase that grammatically is used as attribute @@ -922,9 +952,11 @@ public String toString() { TYPEMAP.put("Q43249", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // morpheme TYPEMAP.put("Q126728876", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // nominal 
modifier, suffix deriving a noun from a preceding noun TYPEMAP.put("Q126734687", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // verbal modifier, verbal derivational suffix + TYPEMAP.put("Q361669", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // modal particle TYPEMAP.put("Q134830", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // prefix TYPEMAP.put("Q54792077", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // prefixoid TYPEMAP.put("Q125858556", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // number-person prefix + TYPEMAP.put("Q1552433", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // preverb TYPEMAP.put("Q62155", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // affix TYPEMAP.put("Q109249055", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // pseudo-affix TYPEMAP.put("Q1153504", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // interfix @@ -949,11 +981,12 @@ public String toString() { TYPEMAP.put("Q18915698", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // established collocation TYPEMAP.put("Q1428334", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // paradigm, an inflection table instead of actual words TYPEMAP.put("Q102500", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // chemical symbol - TYPEMAP.put("Q80071", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // symbol + TYPEMAP.put("Q80071", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // symbol TYPEMAP.put("Q308229", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // currency sign TYPEMAP.put("Q31963", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // emoticon TYPEMAP.put("Q1668151", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // semantic punctuation mark TYPEMAP.put("Q1984758", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // misspelling, not helpful + TYPEMAP.put("Q56161479", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // incorrect form, not helpful // Types that are algorithmically added instead of stored. 
TYPEMAP.put("Q69761768", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // feminine possessive @@ -981,7 +1014,11 @@ public String toString() { TYPEMAP.put("Q98772589", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // expanded contraction TYPEMAP.put("Q1192464", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // rendaku in Japanese TYPEMAP.put("Q126897884", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // denominal + TYPEMAP.put("Q58233068", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // humorous + TYPEMAP.put("Q43747", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // Internet slang + TYPEMAP.put("Q89522629", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // poetic form TYPEMAP.put("Q213458", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // clitic + TYPEMAP.put("Q6548647", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // enclitic TYPEMAP.put("Q340015", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // deixis, words requiring context to understand their meaning TYPEMAP.put("Q162940", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // diacritic @@ -1057,6 +1094,6 @@ public String toString() { } static Set> getMappedGrammemes(String grammeme) { - return TYPEMAP.get(REMAP.getOrDefault(grammeme, grammeme)); + return TYPEMAP.get(grammeme); } } diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/GrammemeSetComparator.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/GrammemeSetComparator.java new file mode 100644 index 00000000..5b1dbf07 --- /dev/null +++ b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/GrammemeSetComparator.java @@ -0,0 +1,27 @@ +/* + * Copyright 2025 Unicode Incorporated and others. All rights reserved. 
+ */ +package org.unicode.wikidata; + +import java.util.Comparator; +import java.util.Set; + +public class GrammemeSetComparator implements Comparator>> { /* Orders sets of grammeme enums: a smaller set sorts first; equal-sized sets are compared element by element with Inflection.ENUM_COMPARATOR. */ + @Override + public int compare(Set> list1, Set> list2) { + var size1 = list1.size(); + var size2 = list2.size(); + if (size1 != size2) { + return size1 - size2; /* sizes are non-negative ints, so this subtraction cannot overflow */ + } + var list2Itr = list2.iterator(); /* both sets have the same size here, so next() below always has an element */ + for (var grammmemeEnum : list1) { /* NOTE(review): elements are paired by iteration order; presumably both sets iterate in the same sorted order - confirm with callers */ + var cmpResult = Inflection.ENUM_COMPARATOR.compare(grammmemeEnum, list2Itr.next()); + if (cmpResult != 0) { + return cmpResult; /* the first differing pair decides the order */ + } + } + return 0; + } + static final GrammemeSetComparator ENUM_COMPARATOR = new GrammemeSetComparator(); /* shared instance; the class declares no instance fields */ +} diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java index 94eb9d0a..ca6b62cd 100644 --- a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java +++ b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java @@ -5,20 +5,13 @@ package org.unicode.wikidata; import java.io.BufferedInputStream; -import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileNotFoundException; -import java.io.FileOutputStream; import java.io.InputStream; import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.io.PrintWriter; +import java.io.IOException; import java.nio.file.Paths; -import java.text.NumberFormat; -import java.util.EnumMap; -import java.util.Locale; import java.util.Properties; -import java.util.regex.Pattern; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -26,16 +19,13 @@ import com.fasterxml.jackson.core.JsonToken; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; -import com.ibm.icu.util.ULocale; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import
org.apache.commons.lang3.StringUtils; import java.nio.charset.StandardCharsets; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; -import java.util.Comparator; import java.util.EnumSet; import java.util.HashSet; import java.util.List; @@ -44,389 +34,10 @@ import java.util.TreeMap; import java.util.TreeSet; -import static org.unicode.wikidata.Grammar.TYPEMAP; -import static org.unicode.wikidata.Grammar.REMAP; import static org.unicode.wikidata.Grammar.Ignorable; import static org.unicode.wikidata.Grammar.PartOfSpeech; import static org.unicode.wikidata.Grammar.Sound; -/** - * Default parser option values. - */ -final class ParserDefaults { - static final String RESOURCES_DIR = "src/main/resources/org/unicode/wikidata/"; - static final String DEFAULT_INFLECTION_FILE_NAME = "inflectional.xml"; - static final String DEFAULT_DICTIONARY_FILE_NAME = "dictionary.lst"; - // Put the rare inflections at the end. - static final Comparator RARITY_AWARE_COMPARATOR = Comparator - .comparing(Inflection::isRareUsage) - .thenComparing(Inflection::compareTo); - private ParserDefaults() {} -} - -/** - * The options to extract the data from the data source. 
- */ -final class ParserOptions { - private static final char COLON_SEPARATOR = ':'; - static final String INFLECTIONS_FILE = "--inflections"; - static final String DICTIONARY_FILE = "--dictionary"; - static final String MAP_GRAMMEME = "--map-grammeme"; - static final String ADD_EXTRA_GRAMMEMES = "--add-extra-grammemes"; - static final String INFLECTION_TYPES = "--inflection-types"; - static final String IGNORE_GRAMMEMES_FOR_TYPES = "--ignore-grammemes-for-types"; - static final String IGNORE_PROPERTY = "--ignore-property"; - static final String INCLUDE_LEMMAS_WITHOUT_WORD = "--include-lemmas-without-words"; - static final String IGNORE_SURFACE_FORM = "--ignore-entries-with-grammemes"; - static final String IGNORE_UNANNOTATED_SURFACE_FORM = "--ignore-unannotated-entries"; - static final String ADD_NORMALIZED_ENTRY = "--add-normalized-entry"; - static final String LANGUAGE_OPT = "--language"; - static final String TIMESTAMP = "--timestamp"; - static final String ADD_DEFAULT_GRAMMEME_FOR_CATEGORY = "--add-default-grammeme-for-category"; - static final String IGNORE_UNSTRUCTURED_ENTRIES = "--ignore-unstructured-entries"; - static final String ADD_SOUND = "--add-sound"; - - boolean includeLemmasWithoutWords = false; - boolean ignoreUnannotated = false; - boolean addNormalizedEntry = false; - boolean ignoreUnstructuredEntries = false; - boolean debug = false; - final boolean addSound; - - EnumSet posToBeInflected; - TreeSet posWithoutGrammemes; - TreeMap> additionalGrammemesDict; - TreeMap> defaultGrammemeForCategory; - TreeMap> claimsToSound; - - ArrayList sourceFilenames; - String inflectionalFilename = ParserDefaults.DEFAULT_INFLECTION_FILE_NAME; - String lexicalDictionaryFilename = ParserDefaults.DEFAULT_DICTIONARY_FILE_NAME; - ArrayList locales = new ArrayList<>(List.of(Locale.ENGLISH.getLanguage())); - List optionsUsedToInvoke = new ArrayList<>(); - - private static void printUsage() { - System.err.println("Usage: ParseLexicon [OPTIONS] [ ...]"); - 
System.err.println("\nOPTIONS"); - System.err.println(INFLECTIONS_FILE + " \tthe file for the inflectional patterns to be generated, default: inflectional.xml"); - System.err.println(DICTIONARY_FILE + " \tthe file for the lexical dictionary to be generated, default: dictionary.lst"); - System.err.println(ADD_EXTRA_GRAMMEMES + " \tFile containing words with the extra grammemes to be added, provide path relative to tools/dictionary-parser/src/main/resources/org/unicode/wikidata/ (only to be used for a temporary grammeme addition)"); - System.err.println(INFLECTION_TYPES + " pos1[,pos2,...]\tthe pos's to be inflected, default: noun"); - System.err.println(IGNORE_GRAMMEMES_FOR_TYPES + " pos1[,pos2,...]\tthe part of speeches for which we don't want to include any grammeme info other than vowel/consonant start, default: (NONE)"); - System.err.println(MAP_GRAMMEME + " grammeme1,grammeme2\twhen grammeme1 is seen in the source dictionary, use grammeme2 instead of it"); - System.err.println(IGNORE_PROPERTY + " grammeme1[,grammeme2,...]\teach property is considered to be an ignorable property."); - System.err.println(IGNORE_SURFACE_FORM + " type1[,type2,...]\tignore entries with specified grammemes. Default: do not ignore"); - System.err.println(IGNORE_UNANNOTATED_SURFACE_FORM + " \tignore entries without any grammeme annotation. Default: do not ignore"); - System.err.println(INCLUDE_LEMMAS_WITHOUT_WORD + "\tinclude lemma entries which do not have corresponding word-entry. Default: do not include"); - System.err.println(TIMESTAMP + "\ttimestamp of the latest lexicon used. Default: NONE"); - System.err.println(LANGUAGE_OPT + "\tComma separated list of languages to extract to the lexical dictionary. Default: " + ULocale.ENGLISH.getName()); - System.err.println(ADD_NORMALIZED_ENTRY + "\tAdds the normalized entry of a dictionary as an additional dictionary entry, only applies for non lowercase entries. 
Default: false"); - System.err.println(ADD_DEFAULT_GRAMMEME_FOR_CATEGORY + "\t[pos=partofSpeech1]category1=grammeme1[,category2=grammeme2.....]\t For each of the provided categories if no grammeme is present then add the default grammeme provided for that category to the word. Only applies for the provided parts of speech if pos= is supplied Default: (NONE)"); - System.err.println(IGNORE_UNSTRUCTURED_ENTRIES + " \tIgnore unstructured entries from the lexicon. Default: false"); - System.err.println(ADD_SOUND + " grammeme1[,grammeme2,...]\tSound properties to check for."); - } - - ParserOptions(String[] args) throws Exception{ - posToBeInflected = EnumSet.of(PartOfSpeech.NOUN); - posWithoutGrammemes = new TreeSet<>(); - additionalGrammemesDict = new TreeMap<>(); - sourceFilenames = new ArrayList<>(); - defaultGrammemeForCategory = new TreeMap<>(); - claimsToSound = new TreeMap<>(); - - for (int i = 0; i < args.length; i++) { - String arg = args[i]; - if (ParserOptions.INFLECTIONS_FILE.equals(arg)) { - inflectionalFilename = args[++i]; - } else if (ParserOptions.DICTIONARY_FILE.equals(arg)) { - lexicalDictionaryFilename = args[++i]; - } else if (ParserOptions.ADD_EXTRA_GRAMMEMES.equals(arg)) { - String additionalGrammemeFilename = args[++i]; - String filePath = Paths.get(ParserDefaults.RESOURCES_DIR + additionalGrammemeFilename).toAbsolutePath().toString(); - try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8))) { - String line; - while ((line = br.readLine()) != null) { - int colonIdx = line.indexOf(COLON_SEPARATOR); - String phrase = line.substring(0, colonIdx); - String grammemes = line.substring(colonIdx + 1).trim(); - additionalGrammemesDict.put(phrase, new TreeSet<>(Arrays.asList(grammemes.split(" ")))); - } - optionsUsedToInvoke.add(ParserOptions.ADD_EXTRA_GRAMMEMES); - optionsUsedToInvoke.add(additionalGrammemeFilename); - } - } else if (ParserOptions.MAP_GRAMMEME.equals(arg)) { - String 
mapGrammeme = args[++i]; - String[] split = mapGrammeme.split(",", 2); - REMAP.put(split[0], split[1]); - - optionsUsedToInvoke.add(ParserOptions.MAP_GRAMMEME); - optionsUsedToInvoke.add(mapGrammeme); - } else if (ParserOptions.IGNORE_PROPERTY.equals(arg)) { - String propertySetToIgnore = args[++i]; - setIgnoreProperty(propertySetToIgnore.split(","), Ignorable.IGNORABLE_PROPERTY); - optionsUsedToInvoke.add(ParserOptions.IGNORE_PROPERTY); - optionsUsedToInvoke.add(propertySetToIgnore); - } else if (ParserOptions.INFLECTION_TYPES.equals(arg)) { - String inflectionTypes = args[++i]; - posToBeInflected.clear(); - - for (String pos : inflectionTypes.split(",")) { - posToBeInflected.add(PartOfSpeech.valueOf(pos.toUpperCase())); - } - - optionsUsedToInvoke.add(ParserOptions.INFLECTION_TYPES); - optionsUsedToInvoke.add(inflectionTypes); - } else if (ParserOptions.IGNORE_GRAMMEMES_FOR_TYPES.equals(arg)) { - String ignoredGrammemeTypes = args[++i]; - - posWithoutGrammemes.clear(); - posWithoutGrammemes.addAll(Arrays.asList(ignoredGrammemeTypes.split(","))); - - optionsUsedToInvoke.add(ParserOptions.IGNORE_GRAMMEMES_FOR_TYPES); - optionsUsedToInvoke.add(ignoredGrammemeTypes); - } else if (ParserOptions.INCLUDE_LEMMAS_WITHOUT_WORD.equals(arg)) { - includeLemmasWithoutWords = true; - optionsUsedToInvoke.add(ParserOptions.INCLUDE_LEMMAS_WITHOUT_WORD); - } else if (ParserOptions.IGNORE_SURFACE_FORM.equals(arg)) { - String ignoreEntriesWithGrammemesStr = args[++i]; - setIgnoreProperty(ignoreEntriesWithGrammemesStr.split(","), Ignorable.IGNORABLE_INFLECTION); - optionsUsedToInvoke.add(ParserOptions.IGNORE_SURFACE_FORM); - optionsUsedToInvoke.add(ignoreEntriesWithGrammemesStr); - } else if (ParserOptions.IGNORE_UNANNOTATED_SURFACE_FORM.equals(arg)) { - ignoreUnannotated = true; - optionsUsedToInvoke.add(ParserOptions.IGNORE_UNANNOTATED_SURFACE_FORM); - } else if (ParserOptions.TIMESTAMP.equals(arg)) { - String timestamp = args[++i]; - 
optionsUsedToInvoke.add(ParserOptions.TIMESTAMP); - optionsUsedToInvoke.add(timestamp); - } else if (ParserOptions.LANGUAGE_OPT.equals(arg)) { - String localeStr = args[++i]; - locales.clear(); - locales.addAll(List.of(localeStr.split(","))); - optionsUsedToInvoke.add(ParserOptions.LANGUAGE_OPT); - optionsUsedToInvoke.add(localeStr); - } else if (ParserOptions.ADD_NORMALIZED_ENTRY.equals(arg)) { - addNormalizedEntry = true; - optionsUsedToInvoke.add(ParserOptions.ADD_NORMALIZED_ENTRY); - } else if (ParserOptions.ADD_DEFAULT_GRAMMEME_FOR_CATEGORY.equals(arg)) { - String categoryDefaultGrammemeString = args[++i]; - String[] tokens = categoryDefaultGrammemeString.split(","); - String posValue = ""; - for (int idx = 0; idx < tokens.length; idx += 1) { - String token = tokens[idx]; - String[] tokenArgs = token.split("="); - if (tokenArgs.length != 2) { - throw new IllegalArgumentException("Default Grammeme for category string does not have entry in the format a=b " + token); - } - String key = tokenArgs[0].toLowerCase(); - String value = tokenArgs[1].toLowerCase(); - if (key.compareTo("pos") == 0) { - if (idx != 0) { - throw new IllegalArgumentException("pos key is not the first argument for default Grammeme for category string " + categoryDefaultGrammemeString); - } - posValue = value; - continue; - } - defaultGrammemeForCategory.putIfAbsent(posValue, new TreeMap<>()); - defaultGrammemeForCategory.get(posValue).put(key, value); - } - - optionsUsedToInvoke.add(ParserOptions.ADD_DEFAULT_GRAMMEME_FOR_CATEGORY); - optionsUsedToInvoke.add(categoryDefaultGrammemeString); - } else if (ParserOptions.IGNORE_UNSTRUCTURED_ENTRIES.equals(arg)) { - ignoreUnstructuredEntries = true; - optionsUsedToInvoke.add(ParserOptions.IGNORE_UNSTRUCTURED_ENTRIES); - } else if (ParserOptions.ADD_SOUND.equals(arg)) { - String soundGrammemeTypes = args[++i]; - - List additionalSoundProperties = Arrays.asList(soundGrammemeTypes.split(",")); - - for (String claimID : 
ParseWikidata.PROPERTIES_WITH_PRONUNCIATION) { - Properties soundRegexes = new Properties(); - String filePath = Paths.get(ParserDefaults.RESOURCES_DIR + claimID + ".properties").toAbsolutePath().toString(); - try (var propertiesStream = new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8)) { - soundRegexes.load(propertiesStream); - var enumMap = new EnumMap(Sound.class); - for (var entry : soundRegexes.entrySet()) { - var key = (String) entry.getKey(); - if (additionalSoundProperties.contains(key)) { - enumMap.put(Sound.valueOf(key.toUpperCase(Locale.ROOT).replace('-', '_')), Pattern.compile((String)entry.getValue())); - } - } - if (enumMap.size() != additionalSoundProperties.size()) { - throw new IllegalArgumentException("Not all sound properties were found"); - } - claimsToSound.put(claimID, enumMap); - } - } - - optionsUsedToInvoke.add(ParserOptions.ADD_SOUND); - optionsUsedToInvoke.add(soundGrammemeTypes); - } else { - sourceFilenames.add(arg); - } - } - - addSound = !claimsToSound.isEmpty(); - - if (sourceFilenames.isEmpty()) { - printUsage(); - throw new IllegalArgumentException(); - } - } - - void setIgnoreProperty(String[] grammemes, Ignorable ignorable) { - var ignorableSet = EnumSet.of(ignorable); - for (String grammeme : grammemes) { - if (grammeme.matches("Q\\d*")) { - TYPEMAP.put(grammeme, ignorableSet); - } - else { - for (Map.Entry>> entry : TYPEMAP.entrySet()) { - for (var grammemeEnum : entry.getValue()) { - String name = grammemeEnum.name(); - if (name.equalsIgnoreCase(grammeme)) { - if (entry.getValue().size() == 1) { - entry.setValue(ignorableSet); - } - else { - entry.getValue().remove(grammemeEnum); - ArrayList> clone = new ArrayList<>(entry.getValue()); - clone.add(ignorable); - entry.setValue(new HashSet<>(clone)); - } - break; - } - } - } - } - } - } -} - -/** - * Contains statistical information on what has been analyzed. 
- */ -final class DocumentState { - int lemmaCount = 0; - int unusableLemmaCount = 0; - int unusableSurfaceFormCount = 0; - int mergedCount = 0; - int incomingSurfaceForm = 0; - TreeMap dictionary = new TreeMap<>(); - ArrayList inflectionPatterns = new ArrayList<>(1024); - - boolean isInflectional() { - return inflectionPatterns.size() > 1 || (inflectionPatterns.size() == 1 && inflectionPatterns.get(0).getCount() > 1); - } - - private void sortInflectionPatterns(ArrayList inflectionPatterns) { - // We are sorting for the common ones first and then compare the identifier for lack of a better - inflectionPatterns.sort(Comparator - .comparing(InflectionPattern::getCount) - .reversed() - .thenComparing(InflectionPattern::getID)); - int identifierEnumeration = 1; - for (InflectionPattern inflectionPattern : inflectionPatterns) { - inflectionPattern.setID(identifierEnumeration++); // This is where we are reassigning identifiers to their new values. - } - } - - public void addDictionaryEntry(DictionaryEntry dictionaryEntry){ - String phrase = dictionaryEntry.phrase; - DictionaryEntry existingDictionaryEntry = dictionary.get(phrase); - if (existingDictionaryEntry == null) { - dictionary.put(phrase, dictionaryEntry); - }else{ - mergedCount++; - existingDictionaryEntry.merge(dictionaryEntry); - } - } - - public void printDocument(ParserOptions parserOptions, long startTime) throws FileNotFoundException { - TreeMap, Integer> grammemeCounts = new TreeMap<>(EnumComparator.ENUM_COMPARATOR); - int unclassifiedTerms = 0; - if (isInflectional()) { - try (PrintWriter inflectionalStream = new PrintWriter(new OutputStreamWriter( - new FileOutputStream(parserOptions.inflectionalFilename), StandardCharsets.UTF_8))) { - inflectionalStream.println("\n" + - ""); - - sortInflectionPatterns(inflectionPatterns); - for (InflectionPattern inflectionPattern : inflectionPatterns) { - inflectionalStream.print(inflectionPattern); - } - inflectionalStream.println(""); - } - } - try (PrintWriter 
lexicalDictionaryStream = new PrintWriter(new OutputStreamWriter( - new FileOutputStream(parserOptions.lexicalDictionaryFilename), StandardCharsets.UTF_8))) { - for (Map.Entry entry : dictionary.entrySet()) { - DictionaryEntry dictionaryEntry = entry.getValue(); - if (dictionaryEntry.getGrammemes().isEmpty()) { - // We don't care about only known words. We need grammeme data - unclassifiedTerms++; - continue; - } - // Print the dictionary entry to the .lst file. - lexicalDictionaryStream.println(dictionaryEntry.toString(isInflectional())); - for (Enum grammeme : dictionaryEntry.getGrammemes()) { - grammemeCounts.merge(grammeme, 1, Integer::sum); - } - } - - NumberFormat percentFormat = NumberFormat.getPercentInstance(Locale.US); - percentFormat.setMaximumFractionDigits(1); - int dictionarySize = dictionary.size(); - StringBuilder source = new StringBuilder(); - Pattern anythingSlash = Pattern.compile(".*/"); - for (String sourceFilename : parserOptions.sourceFilenames) { - source.append(anythingSlash.matcher(sourceFilename).replaceAll("")).append(" "); - } - lexicalDictionaryStream.println("=============================================="); - lexicalDictionaryStream.printf("%30s %7s%n", "Source:", source); - lexicalDictionaryStream.printf("%30s %7d%n", "Lemma terms:", lemmaCount); - lexicalDictionaryStream.printf("%30s %7d%n", "Unusable lemma terms:", unusableLemmaCount); - lexicalDictionaryStream.printf("%30s %7d%n", "Incoming surface forms:", incomingSurfaceForm); - lexicalDictionaryStream.printf("%30s %7d%n", "Surface forms:", dictionarySize); - lexicalDictionaryStream.printf("%30s %7d %7s%n", "Collapsed surface forms:", mergedCount, '(' + percentFormat.format((mergedCount) / (double) incomingSurfaceForm) + ')'); - lexicalDictionaryStream.printf("%30s %7d%n", "Unusable surface forms:", unusableSurfaceFormCount); - lexicalDictionaryStream.printf("%30s %7d %7s%n", "Usable terms:", dictionarySize - unclassifiedTerms, '(' + percentFormat.format((dictionarySize - 
unclassifiedTerms) / (double) dictionarySize) + ')'); - lexicalDictionaryStream.printf("%30s %7d %7s%n", "Unclassified terms:", unclassifiedTerms, '(' + percentFormat.format(unclassifiedTerms / (double) dictionarySize) + ')'); - lexicalDictionaryStream.println("=============================================="); - TreeMap>> categories = new TreeMap<>(); - for (var entry : grammemeCounts.entrySet()) { - var entryCategory = entry.getKey().getClass().getSimpleName(); - if (!categories.containsKey(entryCategory)) { - categories.put(entryCategory, new ArrayList<>()); - } - var categoryValues = categories.get(entryCategory); - categoryValues.add(entry.getKey()); - } - - for (var categoryEntry : categories.entrySet()) { - var categoryName = categoryEntry.getKey(); - lexicalDictionaryStream.printf("%s:%n", categoryName); - var categoryValues = categoryEntry.getValue(); - categoryValues.sort(Comparator.comparing(grammemeCounts::get)); - Collections.reverse(categoryValues); - for (var categoryValue : categoryValues) { - lexicalDictionaryStream.printf(" %-20s %7d %7s%n", categoryValue.toString() + ':', grammemeCounts.get(categoryValue), '(' + percentFormat.format(grammemeCounts.get(categoryValue) / (double) dictionarySize) + ')'); - } - lexicalDictionaryStream.printf("%n"); - } - long endTime = System.currentTimeMillis(); - long elapsedTime = (endTime-startTime); - lexicalDictionaryStream.println("processed in " + (elapsedTime / 1000) + '.' + (elapsedTime % 1000) + " seconds"); - lexicalDictionaryStream.println("License: Creative Commons CC0 License (https://creativecommons.org/publicdomain/zero/1.0/)"); - lexicalDictionaryStream.println("generated with options: " + String.join(" ", parserOptions.optionsUsedToInvoke)); - } - } - - DocumentState() { - } -} - /** * @see https://dumps.wikimedia.org/wikidatawiki/entities/ */ @@ -436,6 +47,7 @@ public final class ParseWikidata { )); static final Set PROPERTIES_WITH_GRAMMEMES = new TreeSet<>(List.of( "P31", // instance of. 
Sometimes phrase information is here. + "P1552", // has characteristic for animacy "P5185" // grammatical gender )); static final Set IMPORTANT_PROPERTIES = new TreeSet<>(PROPERTIES_WITH_GRAMMEMES); @@ -462,22 +74,30 @@ private Lemma() {} } private final ParserOptions parserOptions; - private final DocumentState documentState; + private final DocumentState documentState = new DocumentState(); + private final TreeSet rareLemmas = new TreeSet<>(); /* keys from the filter files whose value is "rare"; matched against lexeme.id */ + private final TreeSet omitLemmas = new TreeSet<>(); /* keys from the filter files whose value is "omit"; matched against lexeme.id */ ParseWikidata(ParserOptions parserOptions) { this.parserOptions = parserOptions; - this.documentState = new DocumentState(); - } - - private void addGrammeme(TreeSet> grammemes, @Nullable String grammeme) { - if (grammeme != null && !grammeme.isEmpty()) { - Set> values = Grammar.getMappedGrammemes(grammeme); - if (values == null) { - throw new RuntimeException(grammeme + " is not a known grammeme"); + for (var language : parserOptions.locales) { /* one filter_<language>.properties file is looked up per requested language */ + Properties rareLemmasProperties = new Properties(); + String filePath = Paths.get(ParserDefaults.RESOURCES_DIR + "filter_" + language + ".properties").toAbsolutePath().toString(); + try (var propertiesStream = new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8)) { + rareLemmasProperties.load(propertiesStream); + for (var entry : rareLemmasProperties.entrySet()) { + String key = entry.getKey().toString(); + String value = entry.getValue().toString(); + switch (value) { + case "rare" : rareLemmas.add(key); break; + case "omit" : omitLemmas.add(key); break; + default: throw new IllegalArgumentException(key + ": Unknown key value " + value); /* not an IOException, so it is not swallowed by the catch below */ + } + } } - else if (!values.contains(Ignorable.IGNORABLE_PROPERTY)) { - grammemes.addAll(values); + catch (IOException e) { + // The filter file is optional: if it is missing or cannot be read, nothing is filtered for this language.
} } } @@ -485,6 +105,10 @@ else if (!values.contains(Ignorable.IGNORABLE_PROPERTY)) { static final String VARIANT_SEPARATOR = "-x-"; private void analyzeLexeme(int lineNumber, Lexeme lexeme) { + if (omitLemmas.contains(lexeme.id)) { + // This lexeme id is marked "omit" in a filter file, so it is skipped entirely. + return; + } Lemma lemma = new Lemma(); Set> partOfSpeechSet = null; for (var lemmaEntry : lexeme.lemmas.entrySet()) { @@ -514,11 +138,15 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) { } lemma.grammemes.addAll(variant); } + if (rareLemmas.contains(lexeme.id)) { + lemma.grammemes.add(Grammar.Usage.RARE); + } extractImportantProperties(lexeme.claims, lemma.grammemes, lexeme.id, lemma.value); if (lemma.grammemes.contains(Ignorable.IGNORABLE_LEMMA) || lemma.grammemes.contains(Ignorable.IGNORABLE_INFLECTION)) { documentState.unusableLemmaCount++; continue; } + lemma.grammemes.remove(Ignorable.IGNORABLE_PROPERTY); for (var form : lexeme.forms) { Inflection currentInflection = null; var representation = form.representations.get(currentLemmaLanguage); @@ -553,18 +181,32 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) { currentInflection.grammemeSet.remove(Grammar.Usage.RARE); } currentInflection.grammemeSet.remove(Ignorable.IGNORABLE_PROPERTY); - lemma.inflections.add(currentInflection); + var grammemeExpansion = parserOptions.expandGramemes != null ? parserOptions.expandGramemes.get(currentInflection.grammemeSet) : null; if (parserOptions.addSound && form.claims != null && !form.claims.isEmpty() && currentInflection.inflection.charAt(0) == lemma.value.charAt(0)) { // We have potential data, and the words aren't mixed together. So this is probably accurate.
addSound(form.claims, currentInflection.grammemeSet, lexeme.id, lemma.value); } + if (grammemeExpansion == null) { + lemma.inflections.add(currentInflection); + } + else { + for (var grammemeSet : grammemeExpansion) { + var expandedInflection = new Inflection(currentInflection.inflection, currentInflection.rareUsage); + expandedInflection.grammemeSet.addAll(currentInflection.grammemeSet); + expandedInflection.grammemeSet.addAll(grammemeSet); + lemma.inflections.add(expandedInflection); + } + } } documentState.incomingSurfaceForm += lemma.inflections.size(); lemma.isRare = lemma.grammemes.contains(Grammar.Usage.RARE); if (lemma.isRare) { lemma.grammemes.remove(Grammar.Usage.RARE); } - lemma.grammemes.remove(Ignorable.IGNORABLE_PROPERTY); + if (lemma.inflections.isEmpty()) { + documentState.unusableLemmaCount++; + return; + } analyzeLemma(lemma); } } @@ -761,18 +403,11 @@ private void analyzeInflections(Lemma lemma, List inputInflections) } // else ignore this unimportant inflection pattern. This is usually trimmed for size. } - Locale currLocale = Locale.forLanguageTag(parserOptions.locales.get(0)); for (int i = 0; i < inflections.size() ; i++) { var inflection = inflections.get(i); String phrase = inflection.getInflection(); InflectionPattern inflectionPatternForDict = nonEmptyInflectionIndices.contains(i) ? 
inflectionPattern : null; documentState.addDictionaryEntry(new DictionaryEntry(phrase, phrase, lemma.isRare, inflection.getGrammemeSet(), inflectionPatternForDict)); - if (parserOptions.addNormalizedEntry) { - String normalizedPhrase = phrase.toLowerCase(currLocale); // locale is specified in the options, by default we use en_US - if (!normalizedPhrase.equals(phrase) && !lemma.isRare) { - documentState.addDictionaryEntry(new DictionaryEntry(normalizedPhrase, phrase, false, inflection.getGrammemeSet(), inflectionPatternForDict)); - } - } } } @@ -812,6 +447,18 @@ private List enumerateInflectionsForGrammemeCombinations(Inflection return resultInflections; } + private void addGrammeme(TreeSet> grammemes, @Nullable String grammeme) { /* Adds the enum that Grammar.DEFAULTMAP holds for grammeme to grammemes; a null or empty name is a no-op. */ + if (grammeme != null && !grammeme.isEmpty()) { + Enum value = Grammar.DEFAULTMAP.get(grammeme); + if (value == null) { + throw new NullPointerException(grammeme + " is not a known grammeme"); /* NOTE(review): the name is non-null but unmapped here; IllegalArgumentException may describe this better - confirm no caller relies on NullPointerException */ + } + else if (!value.equals(Ignorable.IGNORABLE_PROPERTY)) { /* names mapped to IGNORABLE_PROPERTY are accepted but not stored */ + grammemes.add(value); + } + } + } + private void mergeAdditionalGrammemes() { // Add any entries that are missing. The actual properties will be added elsewhere. TreeSet> grammemes = new TreeSet<>(EnumComparator.ENUM_COMPARATOR); diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParserDefaults.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParserDefaults.java new file mode 100644 index 00000000..1d43e76f --- /dev/null +++ b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParserDefaults.java @@ -0,0 +1,23 @@ +/* + * Copyright 2025 Unicode Incorporated and others. All rights reserved. + * Copyright 2020-2024 Apple Inc. All rights reserved. + */ +package org.unicode.wikidata; + +import java.util.Comparator; + +/** + * Default parser option values.
+ */ +final class ParserDefaults { + /** Directory prefix that is prepended to the names of parser resource files; it is a relative path, so it is resolved against the working directory. */ static final String RESOURCES_DIR = "src/main/resources/org/unicode/wikidata/"; + /** File name used for the generated inflectional patterns when none is specified. */ static final String DEFAULT_INFLECTION_FILE_NAME = "inflectional.xml"; + /** File name used for the generated lexical dictionary when none is specified. */ static final String DEFAULT_DICTIONARY_FILE_NAME = "dictionary.lst"; + // Put the rare inflections at the end. + static final Comparator RARITY_AWARE_COMPARATOR = Comparator + .comparing(Inflection::isRareUsage) + .thenComparing(Inflection::compareTo); + + private ParserDefaults() { + } +} diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParserOptions.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParserOptions.java new file mode 100644 index 00000000..3f557c23 --- /dev/null +++ b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParserOptions.java @@ -0,0 +1,267 @@ +/* + * Copyright 2025 Unicode Incorporated and others. All rights reserved. + * Copyright 2020-2024 Apple Inc. All rights reserved. + */ +package org.unicode.wikidata; + +import com.ibm.icu.util.ULocale; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.EnumMap; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.regex.Pattern; + +import static org.unicode.wikidata.Grammar.DEFAULTMAP; +import static org.unicode.wikidata.Grammar.TYPEMAP; + +/** + * The options to extract the data from the data source. 
+ */ +final class ParserOptions { + private static final char COLON_SEPARATOR = ':'; + static final String INFLECTIONS_FILE = "--inflections"; + static final String DICTIONARY_FILE = "--dictionary"; + static final String MAP_GRAMMEME = "--map-grammeme"; + static final String ADD_EXTRA_GRAMMEMES = "--add-extra-grammemes"; + static final String EXPAND_GRAMMEMES = "--expand-grammemes"; + static final String INFLECTION_TYPES = "--inflection-types"; + static final String IGNORE_PROPERTY = "--ignore-property"; + static final String INCLUDE_LEMMAS_WITHOUT_WORD = "--include-lemmas-without-words"; + static final String IGNORE_SURFACE_FORM = "--ignore-entries-with-grammemes"; + static final String LANGUAGE_OPT = "--language"; + static final String TIMESTAMP = "--timestamp"; + static final String ADD_DEFAULT_GRAMMEME_FOR_CATEGORY = "--add-default-grammeme-for-category"; + static final String ADD_SOUND = "--add-sound"; + + boolean includeLemmasWithoutWords = false; + boolean debug = false; + final boolean addSound; + + EnumSet posToBeInflected; + TreeMap>, List>>> expandGramemes; + TreeMap> additionalGrammemesDict; + TreeMap> defaultGrammemeForCategory; + TreeMap> claimsToSound; + + ArrayList sourceFilenames; + String inflectionalFilename = ParserDefaults.DEFAULT_INFLECTION_FILE_NAME; + String lexicalDictionaryFilename = ParserDefaults.DEFAULT_DICTIONARY_FILE_NAME; + ArrayList locales = new ArrayList<>(List.of(Locale.ENGLISH.getLanguage())); + List optionsUsedToInvoke = new ArrayList<>(); + + private static void printUsage() { + System.err.println("Usage: ParseLexicon [OPTIONS] [ ...]"); + System.err.println("\nOPTIONS"); + System.err.println(INFLECTIONS_FILE + " \tthe file for the inflectional patterns to be generated, default: inflectional.xml"); + System.err.println(DICTIONARY_FILE + " \tthe file for the lexical dictionary to be generated, default: dictionary.lst"); + System.err.println(ADD_EXTRA_GRAMMEMES + " \tFile containing words with the extra grammemes to be added, 
provide path relative to tools/dictionary-parser/src/main/resources/org/unicode/wikidata/ (only to be used for a temporary grammeme addition)"); + System.err.println(EXPAND_GRAMMEMES + " grammeme1,grammeme2...:grammeme3,grammeme4...\tWhen the first set of grammemes are matched, add the additional set of grammemes."); + System.err.println(INFLECTION_TYPES + " pos1[,pos2,...]\tthe pos's to be inflected, default: noun"); + System.err.println(MAP_GRAMMEME + " grammeme1,grammeme2\twhen grammeme1 is seen in the source dictionary, use grammeme2 instead of it"); + System.err.println(IGNORE_PROPERTY + " grammeme1[,grammeme2,...]\teach property is considered to be an ignorable property."); + System.err.println(IGNORE_SURFACE_FORM + " type1[,type2,...]\tignore entries with specified grammemes. Default: do not ignore"); + System.err.println(INCLUDE_LEMMAS_WITHOUT_WORD + "\tinclude lemma entries which do not have corresponding word-entry. Default: do not include"); + System.err.println(TIMESTAMP + "\ttimestamp of the latest lexicon used. Default: NONE"); + System.err.println(LANGUAGE_OPT + "\tComma separated list of languages to extract to the lexical dictionary. Default: " + ULocale.ENGLISH.getName()); + System.err.println(ADD_DEFAULT_GRAMMEME_FOR_CATEGORY + "\t[pos=partofSpeech1]category1=grammeme1[,category2=grammeme2.....]\t For each of the provided categories if no grammeme is present then add the default grammeme provided for that category to the word. 
Only applies for the provided parts of speech if pos= is supplied Default: (NONE)"); + System.err.println(ADD_SOUND + " grammeme1[,grammeme2,...]\tSound properties to check for."); + } + + ParserOptions(String[] args) throws IOException { + posToBeInflected = EnumSet.of(Grammar.PartOfSpeech.NOUN); + additionalGrammemesDict = new TreeMap<>(); + sourceFilenames = new ArrayList<>(); + defaultGrammemeForCategory = new TreeMap<>(); + claimsToSound = new TreeMap<>(); + + for (int i = 0; i < args.length; i++) { + String arg = args[i]; + if (ParserOptions.INFLECTIONS_FILE.equals(arg)) { + inflectionalFilename = args[++i]; + } else if (ParserOptions.DICTIONARY_FILE.equals(arg)) { + lexicalDictionaryFilename = args[++i]; + } else if (ParserOptions.ADD_EXTRA_GRAMMEMES.equals(arg)) { + String additionalGrammemeFilename = args[++i]; + String filePath = Paths.get(ParserDefaults.RESOURCES_DIR + additionalGrammemeFilename).toAbsolutePath().toString(); + try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8))) { + String line; + while ((line = br.readLine()) != null) { + int colonIdx = line.indexOf(COLON_SEPARATOR); + String phrase = line.substring(0, colonIdx); + String grammemes = line.substring(colonIdx + 1).trim(); + additionalGrammemesDict.put(phrase, new TreeSet<>(Arrays.asList(grammemes.split(" ")))); + } + optionsUsedToInvoke.add(ParserOptions.ADD_EXTRA_GRAMMEMES); + optionsUsedToInvoke.add(additionalGrammemeFilename); + } + } else if (ParserOptions.MAP_GRAMMEME.equals(arg)) { + String mapGrammeme = args[++i]; + String[] split = mapGrammeme.split(",", 2); + TYPEMAP.put(split[0], toEnumSet(split[1])); + + optionsUsedToInvoke.add(ParserOptions.MAP_GRAMMEME); + optionsUsedToInvoke.add(mapGrammeme); + } else if (ParserOptions.IGNORE_PROPERTY.equals(arg)) { + String propertySetToIgnore = args[++i]; + setIgnoreProperty(propertySetToIgnore.split(","), Grammar.Ignorable.IGNORABLE_PROPERTY); + 
optionsUsedToInvoke.add(ParserOptions.IGNORE_PROPERTY); + optionsUsedToInvoke.add(propertySetToIgnore); + } else if (ParserOptions.EXPAND_GRAMMEMES.equals(arg)) { + String mapGrammemes = args[++i]; + String[] split = mapGrammemes.split(":", 2); + var key = toEnumSet(split[0]); + var valueArray = new ArrayList<>(List.of(toEnumSet(split[1]))); + if (expandGramemes == null) { + expandGramemes = new TreeMap<>(GrammemeSetComparator.ENUM_COMPARATOR); + } + expandGramemes.merge(key, valueArray, (oldList, newList) -> { + oldList.addAll(newList); + return oldList; + }); + optionsUsedToInvoke.add(ParserOptions.EXPAND_GRAMMEMES); + optionsUsedToInvoke.add(mapGrammemes); + } else if (ParserOptions.INFLECTION_TYPES.equals(arg)) { + String inflectionTypes = args[++i]; + posToBeInflected.clear(); + + for (String pos : inflectionTypes.split(",")) { + posToBeInflected.add(Grammar.PartOfSpeech.valueOf(pos.toUpperCase())); + } + + optionsUsedToInvoke.add(ParserOptions.INFLECTION_TYPES); + optionsUsedToInvoke.add(inflectionTypes); + } else if (ParserOptions.INCLUDE_LEMMAS_WITHOUT_WORD.equals(arg)) { + includeLemmasWithoutWords = true; + optionsUsedToInvoke.add(ParserOptions.INCLUDE_LEMMAS_WITHOUT_WORD); + } else if (ParserOptions.IGNORE_SURFACE_FORM.equals(arg)) { + String ignoreEntriesWithGrammemesStr = args[++i]; + setIgnoreProperty(ignoreEntriesWithGrammemesStr.split(","), Grammar.Ignorable.IGNORABLE_INFLECTION); + optionsUsedToInvoke.add(ParserOptions.IGNORE_SURFACE_FORM); + optionsUsedToInvoke.add(ignoreEntriesWithGrammemesStr); + } else if (ParserOptions.TIMESTAMP.equals(arg)) { + String timestamp = args[++i]; + optionsUsedToInvoke.add(ParserOptions.TIMESTAMP); + optionsUsedToInvoke.add(timestamp); + } else if (ParserOptions.LANGUAGE_OPT.equals(arg)) { + String localeStr = args[++i]; + locales.clear(); + locales.addAll(List.of(localeStr.split(","))); + optionsUsedToInvoke.add(ParserOptions.LANGUAGE_OPT); + optionsUsedToInvoke.add(localeStr); + } else if 
(ParserOptions.ADD_DEFAULT_GRAMMEME_FOR_CATEGORY.equals(arg)) { + String categoryDefaultGrammemeString = args[++i]; + String[] tokens = categoryDefaultGrammemeString.split(","); + String posValue = ""; + for (int idx = 0; idx < tokens.length; idx += 1) { + String token = tokens[idx]; + String[] tokenArgs = token.split("="); + if (tokenArgs.length != 2) { + throw new IllegalArgumentException("Default Grammeme for category string does not have entry in the format a=b " + token); + } + String key = tokenArgs[0].toLowerCase(); + String value = tokenArgs[1].toLowerCase(); + if (key.compareTo("pos") == 0) { + if (idx != 0) { + throw new IllegalArgumentException("pos key is not the first argument for default Grammeme for category string " + categoryDefaultGrammemeString); + } + posValue = value; + continue; + } + defaultGrammemeForCategory.putIfAbsent(posValue, new TreeMap<>()); + defaultGrammemeForCategory.get(posValue).put(key, value); + } + + optionsUsedToInvoke.add(ParserOptions.ADD_DEFAULT_GRAMMEME_FOR_CATEGORY); + optionsUsedToInvoke.add(categoryDefaultGrammemeString); + } else if (ParserOptions.ADD_SOUND.equals(arg)) { + String soundGrammemeTypes = args[++i]; + + List additionalSoundProperties = Arrays.asList(soundGrammemeTypes.split(",")); + + for (String claimID : ParseWikidata.PROPERTIES_WITH_PRONUNCIATION) { + Properties soundRegexes = new Properties(); + String filePath = Paths.get(ParserDefaults.RESOURCES_DIR + claimID + ".properties").toAbsolutePath().toString(); + try (var propertiesStream = new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8)) { + soundRegexes.load(propertiesStream); + var enumMap = new EnumMap(Grammar.Sound.class); + for (var entry : soundRegexes.entrySet()) { + var key = (String) entry.getKey(); + if (additionalSoundProperties.contains(key)) { + enumMap.put(Grammar.Sound.valueOf(key.toUpperCase(Locale.ROOT).replace('-', '_')), Pattern.compile((String) entry.getValue())); + } + } + if (enumMap.size() != 
additionalSoundProperties.size()) { + throw new IllegalArgumentException("Not all sound properties were found"); + } + claimsToSound.put(claimID, enumMap); + } + } + + optionsUsedToInvoke.add(ParserOptions.ADD_SOUND); + optionsUsedToInvoke.add(soundGrammemeTypes); + } else { + sourceFilenames.add(arg); + } + } + + addSound = !claimsToSound.isEmpty(); + + if (sourceFilenames.isEmpty()) { + printUsage(); + throw new IllegalArgumentException(); + } + } + + void setIgnoreProperty(String[] grammemes, Grammar.Ignorable ignorable) { + var ignorableSet = EnumSet.of(ignorable); + for (String grammeme : grammemes) { + if (grammeme.matches("Q\\d*")) { + TYPEMAP.put(grammeme, ignorableSet); + } else { + for (Map.Entry>> entry : TYPEMAP.entrySet()) { + for (var grammemeEnum : entry.getValue()) { + String name = grammemeEnum.name(); + if (name.equalsIgnoreCase(grammeme)) { + if (entry.getValue().size() == 1) { + entry.setValue(ignorableSet); + } else { + entry.getValue().remove(grammemeEnum); + ArrayList> clone = new ArrayList<>(entry.getValue()); + clone.add(ignorable); + entry.setValue(new HashSet<>(clone)); + } + break; + } + } + } + } + } + } + + TreeSet> toEnumSet(String grammemes) { + TreeSet> grammemeSet = new TreeSet<>(Inflection.ENUM_COMPARATOR); + for (var grammeme : grammemes.split(",")) { + var grammemeEnum = DEFAULTMAP.get(grammeme); + if (grammemeEnum == null) { + throw new NullPointerException(grammeme + " is not a valid grammeme"); + } + grammemeSet.add(grammemeEnum); + } + return grammemeSet; + } +} diff --git a/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_en.properties b/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_en.properties new file mode 100644 index 00000000..20387004 --- /dev/null +++ b/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_en.properties @@ -0,0 +1,18 @@ +# Copyright 2025 Unicode Incorporated and others. All rights reserved. 
+# +# These are lexemes that should either be omitted, because they are irrelevant but can't easily be tagged as irrelevant, +# or be marked as rare, because they are uncommon words that should be sorted last in the inflection patterns. +L15388=rare +L299075=omit +L342586=omit +L468896=omit +L469033=omit +L469036=omit +L469037=omit +L469040=omit +L469047=omit +L684798=omit +L685028=omit +L685030=omit +L984169=omit +L1321935=omit