diff --git a/inflection/resources/org/unicode/inflection/dictionary/.gitattributes b/inflection/resources/org/unicode/inflection/dictionary/.gitattributes
index 7e2fe358..bff5d4bc 100644
--- a/inflection/resources/org/unicode/inflection/dictionary/.gitattributes
+++ b/inflection/resources/org/unicode/inflection/dictionary/.gitattributes
@@ -1,2 +1,4 @@
dictionary_da.lst filter=lfs diff=lfs merge=lfs -text
+dictionary_en.lst filter=lfs diff=lfs merge=lfs -text
inflectional_da.xml filter=lfs diff=lfs merge=lfs -text
+inflectional_en.xml filter=lfs diff=lfs merge=lfs -text
diff --git a/inflection/resources/org/unicode/inflection/dictionary/dictionary_en.lst b/inflection/resources/org/unicode/inflection/dictionary/dictionary_en.lst
index 9d149220..f761f0f9 100644
--- a/inflection/resources/org/unicode/inflection/dictionary/dictionary_en.lst
+++ b/inflection/resources/org/unicode/inflection/dictionary/dictionary_en.lst
@@ -1,100 +1,3 @@
-Joses: plural proper-noun inflection=2
-Paris: singular plural proper-noun inflection=2 inflection=b
-United States: singular proper-noun inflection=8 inflection=2e5
-a: determiner inflection=26
-an: determiner inflection=26
-animal: singular adjective noun inflection=1
-apple: singular adjective noun inflection=1
-apples: plural noun inflection=1
-are: singular plural first second third present noun verb inflection=1b0 inflection=1
-around: adposition
-bean: singular plural first second third present infinitive noun verb inflection=1 inflection=3
-beans: singular plural third present noun verb inflection=1 inflection=3
-between: adposition
-boy: singular interjection noun inflection=1
-boys: plural noun inflection=1
-cat: singular plural first second third present infinitive noun verb inflection=1 inflection=1c9
-cats: singular plural third present noun verb inflection=1 inflection=1c9
-church: singular plural first second third present infinitive noun verb inflection=7 inflection=9
-churches: singular plural third present noun verb inflection=7 inflection=9
-cities: plural noun inflection=6
-city: singular adjective noun inflection=6
-create: singular plural first second third present infinitive verb inflection=4
-creates: singular third present verb inflection=4
-day: singular noun inflection=1
-days: plural noun inflection=1
-fan: singular plural first second third present infinitive noun verb inflection=1 inflection=1d
-fans: singular plural third present noun verb inflection=1 inflection=1d
-flock: singular plural first second third present infinitive noun verb inflection=1 inflection=3
-friend: singular plural first second third present infinitive noun verb inflection=1 inflection=3
-friends: singular plural third present noun verb inflection=1 inflection=3
-garbage: singular noun inflection=1
-garbages: plural noun inflection=1
-garden: singular plural first second third present infinitive noun verb inflection=1 inflection=3
-geese: plural noun inflection=f4
-glutei maximi: plural noun inflection=17f
-glutei: plural noun inflection=15
-gluteus maximus: singular noun inflection=17f
-gluteus: singular noun inflection=15
-good: singular adjective noun inflection=1
-goods: plural noun inflection=1
-goose: singular plural first second third present infinitive noun verb inflection=4 inflection=f4
-has: singular third present verb inflection=fa
-have: singular plural first second third present infinitive noun verb inflection=1 inflection=fa
-head: singular plural first second third present infinitive adjective noun verb inflection=1 inflection=3
-heads: singular plural third present noun verb inflection=1 inflection=3
-hour: singular vowel-start noun inflection=1
-hours: plural vowel-start noun inflection=1
-houses: singular plural third present noun verb inflection=1 inflection=4
-is: singular third present verb inflection=1b0
-it: singular inanimate accusative nominative third pronoun
-kidney: singular noun inflection=1
-leading: singular gerund adjective noun verb inflection=1 inflection=83
-light: singular plural first second third present infinitive adjective noun verb inflection=1 inflection=9e
-lights: singular plural third present noun verb inflection=1 inflection=9e
-man: singular plural first second third present infinitive interjection noun verb inflection=c inflection=1d
-men: plural noun inflection=c
-mice: plural noun inflection=63
-mouse: singular plural first second third present infinitive noun verb inflection=4 inflection=63
-noun: singular noun inflection=1
-nouns: plural noun inflection=1
-of: adposition
-on: adverb adposition
-orange: singular adjective noun inflection=1
-patio: singular noun inflection=1
-patios: plural noun inflection=1
-phrase: singular plural first second third present infinitive noun verb inflection=1 inflection=4
-phrases: singular plural third present noun verb inflection=1 inflection=4
-pie: singular noun inflection=1
-pies: plural noun inflection=1
-plural: singular adjective noun inflection=1
-plurals: plural noun inflection=1
-red: singular adjective noun inflection=1
-sag: singular plural first second third present infinitive noun verb inflection=1 inflection=14
-sags: singular plural third present noun verb inflection=1 inflection=14
-sheep: singular plural noun inflection=a
-sister: singular noun inflection=1 inflection=33b
-sisters: plural noun inflection=1 inflection=33b
-spatula: singular noun inflection=1
-test: singular plural first second third present infinitive noun verb inflection=1 inflection=3
-tests: singular plural third present noun verb inflection=1 inflection=3
-that: singular inanimate demonstrative determiner pronoun inflection=18b
-the: determiner inflection=26
-theories: plural noun inflection=6
-theory: singular noun inflection=6
-these: plural inanimate demonstrative determiner pronoun inflection=15d
-thesis: singular noun inflection=10
-this: singular inanimate demonstrative determiner pronoun inflection=15d
-those: plural inanimate demonstrative determiner pronoun inflection=18b
-to: adposition
-truss: singular plural first second third present infinitive noun verb inflection=7 inflection=9
-trusses: singular plural third present noun verb inflection=7 inflection=9
-umbrella: singular noun inflection=1
-unicorn: singular consonant-start noun inflection=1
-word: singular plural first second third present infinitive noun verb inflection=1 inflection=3
-work: singular plural first second third present infinitive noun verb inflection=1 inflection=3
-works: singular plural third present noun verb inflection=1 inflection=3
-yen: singular plural noun inflection=a
-==============================================
-Manually curated for tests to pass
-Copyright 2024-2024 Apple Inc. All rights reserved.
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b78d5a694d0d6e301a530687fb4bd017563a3a30e2f60c1decd4cc8e06c413f
+size 4946201
diff --git a/inflection/resources/org/unicode/inflection/dictionary/inflectional_en.xml b/inflection/resources/org/unicode/inflection/dictionary/inflectional_en.xml
index ea52e2df..e5e439e8 100644
--- a/inflection/resources/org/unicode/inflection/dictionary/inflectional_en.xml
+++ b/inflection/resources/org/unicode/inflection/dictionary/inflectional_en.xml
@@ -1,324 +1,3 @@
-
-
-
-
- noun
-
-
-
- s
-
-
-
- proper-noun
-
-
-
- s
-
-
-
- verb
-
-
-
-
- s
-
-
-
- ed
- ed
- ed
-
- ing
-
-
-
- verb
- e
-
- e
- e
- es
- e
- e
- e
- ed
- ed
- ed
- e
- ing
-
-
-
- noun
- y
-
- y
- ies
-
-
-
- noun
-
-
-
- es
-
-
-
- proper-noun
-
-
-
-
-
-
- verb
-
-
-
-
- es
-
-
-
- ed
- ed
- ed
-
- ing
-
-
-
- noun
-
-
-
-
-
-
-
- proper-noun
-
-
-
- es
-
-
-
- noun
- an
-
- an
- en
-
-
-
- noun
- is
-
- is
- es
-
-
-
- verb
-
-
-
-
- s
-
-
-
- ged
- ged
- ged
-
- ging
-
-
-
- noun
- us
-
- us
- i
-
-
-
- verb
-
-
-
-
- s
-
-
-
- ned
- ned
- ned
-
- ning
-
-
-
- determiner
-
-
- noun
- ouse
-
- ouse
- ice
-
-
-
- verb
- ad
-
- ad
- ad
- ads
- ad
- ad
- ad
- d
- d
- d
- ad
- ading
-
-
-
- verb
- ght
-
- ght
- ght
- ghts
- ght
- ght
- ght
- t
- t
- t
- ght
- ghting
-
-
-
- noun
- oose
-
- oose
- eese
-
-
-
- verb
- ve
-
- ve
- ve
- s
- ve
- ve
- ve
- d
- d
- d
- ve
- ving
- st
- th
- st
- st
- st
-
-
-
- determiner
- is
-
- is
- ese
-
-
-
- noun
- us maximus
-
- us maximus
- i maximi
-
-
-
- determiner
- at
-
- at
- ose
-
-
-
- verb
- be
-
- am
- are
- is
- are
- are
- are
- was
- wast
- were
- been
- be
- being
- wert
-
-
-
- verb
-
-
-
-
- s
-
-
-
- ted
- ted
- ted
-
- tin'
- ting
-
-
-
- proper-noun
- of America
-
-
- of America
- of Americas
-
-
-
- noun
- er
-
- a
- er
- ers
-
-
-
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7aab928cb118a6581ed6e0908a00d782ad565cfb539734baa9e37c0e22a797f
+size 516358
diff --git a/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.cpp b/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.cpp
index f421b4c6..982f875d 100644
--- a/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.cpp
+++ b/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.cpp
@@ -1,4 +1,5 @@
/*
+ * Copyright 2025 Unicode Incorporated and others. All rights reserved.
* Copyright 2017-2024 Apple Inc. All rights reserved.
*/
#include
@@ -86,7 +87,8 @@ ::inflection::dialog::DisplayValue* EnGrammarSynthesizer_EnDisplayFunction::getD
}
auto displayValueConstraints(constraints);
::std::u16string countString = GrammarSynthesizerUtil::getFeatureValue(constraints, countFeature);
- if (countString == GrammemeConstants::NUMBER_PLURAL() || countString == GrammemeConstants::NUMBER_SINGULAR()) {
+ bool isRequestingPlural = countString == GrammemeConstants::NUMBER_PLURAL();
+ if (isRequestingPlural || countString == GrammemeConstants::NUMBER_SINGULAR()) {
auto result = inflectPhrase(displayString, constraints, enableInflectionGuess);
if (!result && !enableInflectionGuess) {
return nullptr;
@@ -102,7 +104,7 @@ ::inflection::dialog::DisplayValue* EnGrammarSynthesizer_EnDisplayFunction::getD
::std::u16string caseString = GrammarSynthesizerUtil::getFeatureValue(constraints, caseFeature);
if (caseString == GrammemeConstants::CASE_GENITIVE()) {
- displayString = inflectPossessive(displayString, displayValueConstraints);
+ displayString = inflectPossessive(displayString, displayValueConstraints, isRequestingPlural);
}
return definitenessDisplayFunction.addDefiniteness(new ::inflection::dialog::DisplayValue(displayString, displayValueConstraints), constraints);
@@ -200,7 +202,7 @@ ::std::u16string EnGrammarSynthesizer_EnDisplayFunction::guessSingularInflection
return displayString;
}
-::std::u16string EnGrammarSynthesizer_EnDisplayFunction::inflectPossessive(const ::std::u16string& displayString, ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& valueConstraints) const
+::std::u16string EnGrammarSynthesizer_EnDisplayFunction::inflectPossessive(const ::std::u16string& displayString, ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string>& valueConstraints, bool isRequestingPlural) const
{
::std::u16string lowercase;
::inflection::util::StringViewUtils::lowercase(&lowercase, displayString, ::inflection::util::LocaleUtils::ENGLISH());
@@ -218,7 +220,7 @@ ::std::u16string EnGrammarSynthesizer_EnDisplayFunction::inflectPossessive(const
else {
if (::inflection::util::StringViewUtils::endsWith(suffix, u"s")) {
::std::unique_ptr<::inflection::tokenizer::TokenChain> tokenChain(npc(npc(tokenizer.get())->createTokenChain(displayString)));
- if (dictionary.hasAllProperties(npc(npc(tokenChain->getTail())->getPrevious())->getValue(), pluralProperty)) {
+ if (isRequestingPlural || dictionary.hasAllProperties(npc(npc(tokenChain->getTail())->getPrevious())->getValue(), pluralProperty)) {
suffixStr = u"’";
}
}
diff --git a/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.hpp b/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.hpp
index 771d99ac..f70a0af2 100644
--- a/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.hpp
+++ b/inflection/src/inflection/grammar/synthesis/EnGrammarSynthesizer_EnDisplayFunction.hpp
@@ -1,4 +1,5 @@
/*
+ * Copyright 2025 Unicode Incorporated and others. All rights reserved.
* Copyright 2017-2024 Apple Inc. All rights reserved.
*/
#pragma once
@@ -42,7 +43,7 @@ class inflection::grammar::synthesis::EnGrammarSynthesizer_EnDisplayFunction
private:
::std::u16string guessPluralInflection(const ::std::u16string& displayString) const;
::std::u16string guessSingularInflection(const ::std::u16string& displayString) const;
- ::std::u16string inflectPossessive(const std::u16string &displayString, std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &valueConstraints) const;
+ ::std::u16string inflectPossessive(const std::u16string &displayString, std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &valueConstraints, bool isRequestingPlural) const;
::std::optional<::std::u16string> inflectPhrase(const std::u16string &originalString, const ::std::map<::inflection::dialog::SemanticFeature, ::std::u16string> &constraints, bool enableInflectionGuess) const;
public:
diff --git a/inflection/test/src/inflection/dictionary/DictionaryMetaDataTest.cpp b/inflection/test/src/inflection/dictionary/DictionaryMetaDataTest.cpp
index a5f4270a..bf53c759 100644
--- a/inflection/test/src/inflection/dictionary/DictionaryMetaDataTest.cpp
+++ b/inflection/test/src/inflection/dictionary/DictionaryMetaDataTest.cpp
@@ -1,4 +1,5 @@
/*
+ * Copyright 2025 Unicode Incorporated and others. All rights reserved.
* Copyright 2016-2024 Apple Inc. All rights reserved.
*/
#include "catch2/catch_test_macros.hpp"
@@ -33,7 +34,6 @@ TEST_CASE("DictionaryMetaDataTest#testEnglish")
REQUIRE_FALSE(npc(dictionary)->getPropertyValues(u"man", u"inflection").empty());
REQUIRE_FALSE(npc(dictionary)->getPropertyValues(u"theories", u"inflection").empty());
REQUIRE_FALSE(npc(dictionary)->getPropertyValues(u"theory", u"inflection").empty());
- REQUIRE_FALSE(npc(dictionary)->getPropertyValues(u"United States", u"inflection").empty());
REQUIRE(npc(dictionary)->hasProperty(u"Paris", u"proper-noun"));
REQUIRE(npc(dictionary)->hasProperty(u"paris", u"proper-noun"));
int64_t properties = 0;
diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/DocumentState.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/DocumentState.java
new file mode 100644
index 00000000..615d156f
--- /dev/null
+++ b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/DocumentState.java
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2025 Unicode Incorporated and others. All rights reserved.
+ * Copyright 2020-2024 Apple Inc. All rights reserved.
+ */
+package org.unicode.wikidata;
+
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.nio.charset.StandardCharsets;
+import java.text.NumberFormat;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.regex.Pattern;
+
+/**
+ * Holds the dictionary entries, the inflection patterns and the counters for what has been
+ * analyzed, and writes them out together with summary statistics.
+ */
+final class DocumentState {
+    int lemmaCount = 0;
+    int unusableLemmaCount = 0;
+    int unusableSurfaceFormCount = 0;
+    int mergedCount = 0;
+    int incomingSurfaceForm = 0;
+    /** Dictionary entries keyed by phrase. */
+    TreeMap<String, DictionaryEntry> dictionary = new TreeMap<>();
+    ArrayList<InflectionPattern> inflectionPatterns = new ArrayList<>(1024);
+
+    /**
+     * Returns true when there is more than one inflection pattern, or when the single
+     * known pattern has a count greater than one.
+     */
+    boolean isInflectional() {
+        return inflectionPatterns.size() > 1 || (inflectionPatterns.size() == 1 && inflectionPatterns.get(0).getCount() > 1);
+    }
+
+    /**
+     * Orders the patterns by descending count, breaking ties by their current identifier,
+     * and then renumbers them sequentially starting at 1.
+     */
+    private void sortInflectionPatterns(ArrayList<InflectionPattern> inflectionPatterns) {
+        // The most common patterns come first. The current identifier is only a tiebreaker
+        // for lack of a better criterion.
+        inflectionPatterns.sort(Comparator
+                .comparing(InflectionPattern::getCount)
+                .reversed()
+                .thenComparing(InflectionPattern::getID));
+        int identifierEnumeration = 1;
+        for (InflectionPattern inflectionPattern : inflectionPatterns) {
+            inflectionPattern.setID(identifierEnumeration++); // This is where we are reassigning identifiers to their new values.
+        }
+    }
+
+    /**
+     * Stores the entry under its phrase. When an entry already exists for that phrase, the
+     * new entry is merged into it and the merge is counted.
+     */
+    public void addDictionaryEntry(DictionaryEntry dictionaryEntry) {
+        String phrase = dictionaryEntry.phrase;
+        DictionaryEntry existingDictionaryEntry = dictionary.get(phrase);
+        if (existingDictionaryEntry == null) {
+            dictionary.put(phrase, dictionaryEntry);
+        } else {
+            mergedCount++;
+            existingDictionaryEntry.merge(dictionaryEntry);
+        }
+    }
+
+    /**
+     * Writes the inflection patterns (only when {@link #isInflectional()} is true) and the
+     * lexical dictionary, followed by a statistics footer, to the files named in the options.
+     *
+     * @param parserOptions supplies the output filenames, the source filenames and the invocation options
+     * @param startTime start of processing in milliseconds, on the same clock as {@link System#currentTimeMillis()}
+     * @throws FileNotFoundException if an output file cannot be opened for writing
+     */
+    void printDocument(ParserOptions parserOptions, long startTime) throws FileNotFoundException {
+        TreeMap<Enum<?>, Integer> grammemeCounts = new TreeMap<>(EnumComparator.ENUM_COMPARATOR);
+        int unclassifiedTerms = 0;
+        if (isInflectional()) {
+            try (PrintWriter inflectionalStream = new PrintWriter(new OutputStreamWriter(
+                    new FileOutputStream(parserOptions.inflectionalFilename), StandardCharsets.UTF_8))) {
+                inflectionalStream.println("\n" +
+                        "");
+
+                // Sort and renumber the patterns, then print them in that order.
+                sortInflectionPatterns(inflectionPatterns);
+                for (InflectionPattern inflectionPattern : inflectionPatterns) {
+                    inflectionalStream.print(inflectionPattern);
+                }
+                inflectionalStream.println("");
+            }
+        }
+        try (PrintWriter lexicalDictionaryStream = new PrintWriter(new OutputStreamWriter(
+                new FileOutputStream(parserOptions.lexicalDictionaryFilename), StandardCharsets.UTF_8))) {
+            for (Map.Entry<String, DictionaryEntry> entry : dictionary.entrySet()) {
+                DictionaryEntry dictionaryEntry = entry.getValue();
+                if (dictionaryEntry.getGrammemes().isEmpty()) {
+                    // We don't care about only known words. We need grammeme data
+                    unclassifiedTerms++;
+                    continue;
+                }
+                // Print the dictionary entry to the .lst file.
+                lexicalDictionaryStream.println(dictionaryEntry.toString(isInflectional()));
+                for (Enum<?> grammeme : dictionaryEntry.getGrammemes()) {
+                    grammemeCounts.merge(grammeme, 1, Integer::sum);
+                }
+            }
+
+            NumberFormat percentFormat = NumberFormat.getPercentInstance(Locale.US);
+            percentFormat.setMaximumFractionDigits(1);
+            int dictionarySize = dictionary.size();
+            StringBuilder source = new StringBuilder();
+            Pattern anythingSlash = Pattern.compile(".*/");
+            for (String sourceFilename : parserOptions.sourceFilenames) {
+                source.append(anythingSlash.matcher(sourceFilename).replaceAll("")).append(" ");
+            }
+            lexicalDictionaryStream.println("==============================================");
+            lexicalDictionaryStream.printf("%30s %7s%n", "Source:", source);
+            lexicalDictionaryStream.printf("%30s %7d%n", "Lemma terms:", lemmaCount);
+            lexicalDictionaryStream.printf("%30s %7d%n", "Unusable lemma terms:", unusableLemmaCount);
+            lexicalDictionaryStream.printf("%30s %7d%n", "Incoming surface forms:", incomingSurfaceForm);
+            lexicalDictionaryStream.printf("%30s %7d%n", "Surface forms:", dictionarySize);
+            lexicalDictionaryStream.printf("%30s %7d %7s%n", "Collapsed surface forms:", mergedCount, '(' + percentFormat.format((mergedCount) / (double) incomingSurfaceForm) + ')');
+            lexicalDictionaryStream.printf("%30s %7d%n", "Unusable surface forms:", unusableSurfaceFormCount);
+            lexicalDictionaryStream.printf("%30s %7d %7s%n", "Usable terms:", dictionarySize - unclassifiedTerms, '(' + percentFormat.format((dictionarySize - unclassifiedTerms) / (double) dictionarySize) + ')');
+            lexicalDictionaryStream.printf("%30s %7d %7s%n", "Unclassified terms:", unclassifiedTerms, '(' + percentFormat.format(unclassifiedTerms / (double) dictionarySize) + ')');
+            lexicalDictionaryStream.println("==============================================");
+            // Group the counted grammemes by the simple name of the enum that declares them.
+            TreeMap<String, List<Enum<?>>> categories = new TreeMap<>();
+            for (var entry : grammemeCounts.entrySet()) {
+                // getDeclaringClass() still names the enum when a constant has its own class body.
+                var entryCategory = entry.getKey().getDeclaringClass().getSimpleName();
+                categories.computeIfAbsent(entryCategory, ignored -> new ArrayList<>()).add(entry.getKey());
+            }
+
+            for (var categoryEntry : categories.entrySet()) {
+                var categoryName = categoryEntry.getKey();
+                lexicalDictionaryStream.printf("%s:%n", categoryName);
+                var categoryValues = categoryEntry.getValue();
+                categoryValues.sort(Comparator.comparing(grammemeCounts::get));
+                Collections.reverse(categoryValues);
+                for (var categoryValue : categoryValues) {
+                    lexicalDictionaryStream.printf(" %-20s %7d %7s%n", categoryValue.toString() + ':', grammemeCounts.get(categoryValue), '(' + percentFormat.format(grammemeCounts.get(categoryValue) / (double) dictionarySize) + ')');
+                }
+                lexicalDictionaryStream.printf("%n");
+            }
+            long elapsedTime = System.currentTimeMillis() - startTime;
+            // Zero-pad the milliseconds so that 1005 ms reads as 1.005 rather than 1.5 seconds.
+            lexicalDictionaryStream.println("processed in " + String.format(Locale.ROOT, "%d.%03d", elapsedTime / 1000, elapsedTime % 1000) + " seconds");
+            lexicalDictionaryStream.println("License: Creative Commons CC0 License (https://creativecommons.org/publicdomain/zero/1.0/)");
+            lexicalDictionaryStream.println("generated with options: " + String.join(" ", parserOptions.optionsUsedToInvoke));
+        }
+    }
+
+    DocumentState() {
+    }
+}
diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/Grammar.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/Grammar.java
index 631ee973..4e335023 100644
--- a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/Grammar.java
+++ b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/Grammar.java
@@ -233,8 +233,7 @@ public String toString() {
}
enum FormType {
- SHORT_FORM,
- IRREGULAR;
+ SHORT_FORM;
private final String printableValue;
FormType() {
@@ -491,6 +490,12 @@ enum Sound {
RIEUL_END,
VOWEL_START,
VOWEL_END,
+ BACK_ROUND,
+ BACK_UNROUND,
+ FRONT_ROUND,
+ FRONT_UNROUND,
+ HARD_CONSONANT,
+ SOFT_CONSONANT,
;
private final String printableValue;
@@ -509,6 +514,7 @@ enum Register {
FAMILIAR,
FORMAL,
HIGH,
+ PEJORATIVE, // pejorative (or derogatory) A word form expressing a negative or belittling attitude towards the person or thing referred to
INFORMAL,
INTIMATE,
LITERARY;
@@ -538,8 +544,17 @@ public String toString() {
}
}
+    static final Map<String, Enum<?>> DEFAULTMAP = new HashMap<>(1021); // toString() value -> enum constant
+    // getEnumConstants() returns null for a nested type that is not an enum, so such types contribute nothing.
+    static {
+        for (var enumClass : Grammar.class.getDeclaredClasses()) {
+            for (var enumValue : enumClass.isEnum() ? enumClass.getEnumConstants() : new Object[0]) {
+                DEFAULTMAP.put(enumValue.toString(), (Enum<?>) enumValue);
+            }
+        }
+    }
+
static final Map>> TYPEMAP = new HashMap<>(1021);
- static final Map REMAP = new HashMap<>(1021);
static {
@@ -553,6 +568,7 @@ public String toString() {
TYPEMAP.put("Q918270", EnumSet.of(PartOfSpeech.ABBREVIATION)); // initalism
TYPEMAP.put("Q30619513", EnumSet.of(PartOfSpeech.ABBREVIATION)); // USPS abbreviation
TYPEMAP.put("Q126473", EnumSet.of(PartOfSpeech.ABBREVIATION)); // contraction
+ TYPEMAP.put("Q1130279", EnumSet.of(PartOfSpeech.ABBREVIATION)); // hypocorism, short nickname
TYPEMAP.put("Q34698", EnumSet.of(PartOfSpeech.ADJECTIVE));
TYPEMAP.put("Q12259986", EnumSet.of(PartOfSpeech.ADJECTIVE)); // prenominal adjective
TYPEMAP.put("Q7233569", EnumSet.of(PartOfSpeech.ADJECTIVE)); // postpositive adjective
@@ -560,6 +576,7 @@ public String toString() {
TYPEMAP.put("Q1091269", EnumSet.of(PartOfSpeech.ADJECTIVE)); // na-adjective in Japanese
TYPEMAP.put("Q7250170", EnumSet.of(PartOfSpeech.ADJECTIVE)); // proper adjective, the adjective form of a proper noun
TYPEMAP.put("Q332375", EnumSet.of(PartOfSpeech.ADJECTIVE)); // absolute adjective (uncomparable adjective)
+ TYPEMAP.put("Q3618903", EnumSet.of(PartOfSpeech.ADJECTIVE)); // indefinite adjective
TYPEMAP.put("Q380057", EnumSet.of(PartOfSpeech.ADVERB));
TYPEMAP.put("Q1668170", EnumSet.of(PartOfSpeech.INTERROGATIVE, PartOfSpeech.ADVERB));
TYPEMAP.put("Q1522423", EnumSet.of(PartOfSpeech.ADVERB)); // locative adverb, but we don't need that much precision about the type.
@@ -573,6 +590,7 @@ public String toString() {
TYPEMAP.put("Q28833099", EnumSet.of(PartOfSpeech.CONJUNCTION)); // coordinating conjunction
TYPEMAP.put("Q576271", EnumSet.of(PartOfSpeech.DETERMINER));
TYPEMAP.put("Q5051", new HashSet<>(Arrays.asList(Case.GENITIVE, PartOfSpeech.DETERMINER))); // possessive determiner
+ TYPEMAP.put("Q2824480", EnumSet.of(PartOfSpeech.DETERMINER)); // demonstrative adjective, but it's really a determiner.
TYPEMAP.put("Q83034", EnumSet.of(PartOfSpeech.INTERJECTION));
TYPEMAP.put("Q2304610", EnumSet.of(PartOfSpeech.INTERROGATIVE));
TYPEMAP.put("Q12021746", EnumSet.of(PartOfSpeech.INTERROGATIVE));
@@ -598,9 +616,11 @@ public String toString() {
TYPEMAP.put("Q115762248", EnumSet.of(PartOfSpeech.PARTICLE)); // vocative particle
TYPEMAP.put("Q113076880", EnumSet.of(PartOfSpeech.ADVERB)); // postpositive adverb
TYPEMAP.put("Q65807752", EnumSet.of(PartOfSpeech.ADVERB)); // demonstrative adverb
+ TYPEMAP.put("Q134316", EnumSet.of(PartOfSpeech.ADPOSITION)); // adposition
TYPEMAP.put("Q161873", EnumSet.of(PartOfSpeech.ADPOSITION)); // postposition
TYPEMAP.put("Q4833830", EnumSet.of(PartOfSpeech.ADPOSITION)); // preposition
TYPEMAP.put("Q36224", EnumSet.of(PartOfSpeech.PRONOUN));
+ TYPEMAP.put("Q2006180", EnumSet.of(PartOfSpeech.PRONOUN)); // pro-form, word that substitutes for another word, broader scope than pronoun
TYPEMAP.put("Q147276", EnumSet.of(PartOfSpeech.PROPER_NOUN)); // proper noun
TYPEMAP.put("Q7884789", EnumSet.of(PartOfSpeech.PROPER_NOUN)); // toponym
TYPEMAP.put("Q43229", EnumSet.of(PartOfSpeech.PROPER_NOUN)); // organization
@@ -611,10 +631,12 @@ public String toString() {
// TYPEMAP.put("Q1350145", EnumSet.of(PartOfSpeech.VERB, PartOfSpeech.NOUN)); // verbal noun, like boxing
TYPEMAP.put("Q11399805", EnumSet.of(PartOfSpeech.VERB)); // auxiliary verb
TYPEMAP.put("Q131431824", EnumSet.of(PartOfSpeech.VERB)); // proper verb where you use a proper noun as a verb
+ TYPEMAP.put("Q3254028", EnumSet.of(PartOfSpeech.VERB)); // separable verb, verb with a prefix which separates from the core verb in certain positions in a sentence
TYPEMAP.put("Q4239848", new HashSet<>(Arrays.asList(FormType.SHORT_FORM, PartOfSpeech.ADJECTIVE))); // short form of an adjective
- TYPEMAP.put("short-form", EnumSet.of(FormType.SHORT_FORM));
- TYPEMAP.put("irregular", EnumSet.of(FormType.IRREGULAR));
+ TYPEMAP.put("Q112154", EnumSet.of(FormType.SHORT_FORM)); // apocope, loss of word-final sounds
+ TYPEMAP.put("Q650250", EnumSet.of(FormType.SHORT_FORM)); // elision, omission of one or more sounds in a word
+ TYPEMAP.put("Q114092330", EnumSet.of(FormType.SHORT_FORM)); // prevocalic form, linguistic feature marking a linguistic unit as appearing only before vowels
TYPEMAP.put("Q109267112", EnumSet.of(Polarity.AFFIRMATIVE));
TYPEMAP.put("Q1478451", EnumSet.of(Polarity.NEGATIVE));
@@ -647,13 +669,17 @@ public String toString() {
TYPEMAP.put("Q53998049", EnumSet.of(Count.UNCOUNTABLE)); // indefinite number, neither singular nor plural, uncountable. Unmarked appears in declension when it is not necessary to specify singular or plural, such as because it is a proper name or is next to a determiner or a quantifier.
TYPEMAP.put("stressed", EnumSet.of(Emphasis.STRESSED));
+ TYPEMAP.put("Q55464002", EnumSet.of(Emphasis.STRESSED)); // strong form
TYPEMAP.put("unstressed", EnumSet.of(Emphasis.UNSTRESSED));
+ TYPEMAP.put("Q55464014", EnumSet.of(Emphasis.UNSTRESSED)); // weak form
TYPEMAP.put("Q499327", EnumSet.of(Gender.MASCULINE));
TYPEMAP.put("Q54020116", new HashSet<>(Arrays.asList(Gender.MASCULINE, Animacy.ANIMATE)));
TYPEMAP.put("Q52943434", new HashSet<>(Arrays.asList(Gender.MASCULINE, Animacy.INANIMATE)));
TYPEMAP.put("Q27918551", new HashSet<>(Arrays.asList(Gender.MASCULINE, Animacy.HUMAN))); // masculine personal
TYPEMAP.put("Q52943193", new HashSet<>(Arrays.asList(Gender.MASCULINE, Animacy.ANIMATE))); // masculine animate non-personal
+ TYPEMAP.put("Q18478758", new HashSet<>(Arrays.asList(Gender.MASCULINE, Gender.FEMININE))); // common of two genders
+ TYPEMAP.put("Q100919075", new HashSet<>(Arrays.asList(Gender.MASCULINE, Gender.FEMININE))); // ambiguous gender
TYPEMAP.put("Q1775415", EnumSet.of(Gender.FEMININE));
TYPEMAP.put("Q1775461", EnumSet.of(Gender.NEUTER));
TYPEMAP.put("Q1305037", EnumSet.of(Gender.COMMON));
@@ -713,12 +739,12 @@ public String toString() {
TYPEMAP.put("Q956030", new HashSet<>(Arrays.asList(Definiteness.INDEFINITE, PartOfSpeech.PRONOUN)));
// TYPEMAP.put(asTreeSet("Q53998049"), EnumSet.of(Definiteness.INDEFINITE)); // indefinite number
TYPEMAP.put("Q10265745", new HashSet<>(Arrays.asList(Definiteness.DEMONSTRATIVE, PartOfSpeech.DETERMINER))); // demonstrative determiner
+ TYPEMAP.put("Q79377486", new HashSet<>(Arrays.asList(Definiteness.DEMONSTRATIVE, PartOfSpeech.DETERMINER))); // distal, demonstrative
TYPEMAP.put("Q10345583", new HashSet<>(Arrays.asList(Tense.PRESENT, VerbType.PARTICIPLE)));
TYPEMAP.put("Q1230649", new HashSet<>(Arrays.asList(Tense.PAST, VerbType.PARTICIPLE)));
TYPEMAP.put("Q72249355", new HashSet<>(Arrays.asList(Voice.ACTIVE, VerbType.PARTICIPLE)));
TYPEMAP.put("Q72249544", new HashSet<>(Arrays.asList(Voice.PASSIVE, VerbType.PARTICIPLE)));
- TYPEMAP.put("Q112785242", new HashSet<>(Arrays.asList(Aspect.IMPERFECT, VerbType.PARTICIPLE))); // imperfect participle
TYPEMAP.put("Q113133303", EnumSet.of(VerbType.PARTICIPLE)); // conjunctive participle
TYPEMAP.put("Q192613", EnumSet.of(Tense.PRESENT)); // present tense
TYPEMAP.put("Q3910936", new HashSet<>(Arrays.asList(Aspect.SIMPLE, Tense.PRESENT))); // simple present and usually future
@@ -769,6 +795,7 @@ public String toString() {
TYPEMAP.put("Q953129", EnumSet.of(PartOfSpeech.PRONOUN)); // reflexive pronoun
TYPEMAP.put("Q130266209", EnumSet.of(PartOfSpeech.PRONOUN)); // reflexive personal pronoun
TYPEMAP.put("Q1050744", EnumSet.of(PartOfSpeech.PRONOUN)); // relative pronoun
+ TYPEMAP.put("Q1462657", EnumSet.of(PartOfSpeech.PRONOUN)); // reciprocal pronoun
TYPEMAP.put("Q625581", EnumSet.of(Mood.CONDITIONAL));
TYPEMAP.put("Q3686414", new HashSet<>(Arrays.asList(Tense.PRESENT, Mood.CONDITIONAL))); // conditional present
@@ -808,7 +835,7 @@ public String toString() {
TYPEMAP.put("Q108524486", EnumSet.of(Aspect.IMPERFECT));
TYPEMAP.put("Q7240943", new HashSet<>(Arrays.asList(Tense.PRESENT, Aspect.IMPERFECT))); // present continuous/present imperfect
TYPEMAP.put("Q56650537", new HashSet<>(Arrays.asList(Tense.PAST, Aspect.IMPERFECT))); // past continuous/present imperfect
- TYPEMAP.put("Q56650537", new HashSet<>(Arrays.asList(Aspect.IMPERFECT, VerbType.PARTICIPLE))); // imperfect participle
+ TYPEMAP.put("Q112785242", new HashSet<>(Arrays.asList(Aspect.IMPERFECT, VerbType.PARTICIPLE))); // imperfect participle
TYPEMAP.put("Q113115936", new HashSet<>(Arrays.asList(Aspect.PERFECT, VerbType.PARTICIPLE))); // perfect participle
TYPEMAP.put("Q623742", EnumSet.of(Aspect.PLUPERFECT));
@@ -823,6 +850,9 @@ public String toString() {
TYPEMAP.put("Q56650485", new HashSet<>(Arrays.asList(Person.SECOND, Register.INFORMAL)));
TYPEMAP.put("Q66664394", EnumSet.of(Register.INTIMATE)); // endearing
TYPEMAP.put("high", EnumSet.of(Register.HIGH));
+ TYPEMAP.put("Q545779", EnumSet.of(Register.PEJORATIVE)); // pejorative
+ TYPEMAP.put("Q54948374", EnumSet.of(Register.PEJORATIVE)); // depreciative form
+ TYPEMAP.put("Q1521634", EnumSet.of(Register.PEJORATIVE)); // vulgarism
TYPEMAP.put("Q75242466", EnumSet.of(Register.CONVERSATIONAL)); // chalita bhasha
TYPEMAP.put("Q55228835", EnumSet.of(Register.CONVERSATIONAL)); // colloquial form
TYPEMAP.put("Q20613396", EnumSet.of(Register.LITERARY)); // historical language style that was used in 19th and 20th century Bangla literary works
@@ -834,6 +864,7 @@ public String toString() {
TYPEMAP.put("Q1358239", EnumSet.of(Sizeness.AUGMENTATIVE));
TYPEMAP.put("Q221446", EnumSet.of(Sizeness.AUGMENTATIVE)); // reduplication in Japanese
+ TYPEMAP.put("Q6029894", EnumSet.of(Sizeness.AUGMENTATIVE)); // intensive
TYPEMAP.put("Q108709", EnumSet.of(Sizeness.DIMINUTIVE));
TYPEMAP.put("consonant-end", EnumSet.of(Sound.CONSONANT_END));
@@ -841,9 +872,6 @@ public String toString() {
TYPEMAP.put("rieul-end", EnumSet.of(Sound.RIEUL_END));
TYPEMAP.put("vowel-end", EnumSet.of(Sound.VOWEL_END));
TYPEMAP.put("vowel-start", EnumSet.of(Sound.VOWEL_START));
-// TYPEMAP.put("Q650250", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // elision, omission of one or more sounds in a word or phrase
-// TYPEMAP.put("Q114092330", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // prevocalic form, linguistic feature marking a linguistic unit as appearing only before vowels
-// TYPEMAP.put("Q112154", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // apocope, loss of word-final sounds
TYPEMAP.put("Q101252532", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // where consonant is unmutated
TYPEMAP.put("Q56648699", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // soft mutation, where consonant becomes more sonorous
TYPEMAP.put("Q117262361", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // pausal form, form of a word realised in hiatus between prosodic units
@@ -856,6 +884,7 @@ public String toString() {
TYPEMAP.put("standard", EnumSet.of(Usage.STANDARD));
TYPEMAP.put("Q55094451", EnumSet.of(Usage.RARE)); // rare form
+ TYPEMAP.put("Q58157328", EnumSet.of(Usage.RARE)); // rare, indicates whether lexeme sense is used rarely
TYPEMAP.put("Q8102", EnumSet.of(Usage.RARE)); // slang
TYPEMAP.put("Q12237354", EnumSet.of(Usage.RARE)); // obsolete word
TYPEMAP.put("Q54943392", EnumSet.of(Usage.RARE)); // obsolete form
@@ -883,6 +912,7 @@ public String toString() {
TYPEMAP.put("Q56042915", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // prepositional phrase
TYPEMAP.put("Q1778442", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // verb phrase
TYPEMAP.put("Q384876", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // set phrase
+ TYPEMAP.put("Q3062294", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // Latin phrase
TYPEMAP.put("Q1527589", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // phrasal verb
TYPEMAP.put("Q117606981", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // verbo-nominal syntagma
TYPEMAP.put("Q12734432", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // attributive locution, phrase that grammatically is used as attribute
@@ -922,9 +952,11 @@ public String toString() {
TYPEMAP.put("Q43249", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // morpheme
TYPEMAP.put("Q126728876", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // nominal modifier, suffix deriving a noun from a preceding noun
TYPEMAP.put("Q126734687", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // verbal modifier, verbal derivational suffix
+ TYPEMAP.put("Q361669", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // modal particle
TYPEMAP.put("Q134830", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // prefix
TYPEMAP.put("Q54792077", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // prefixoid
TYPEMAP.put("Q125858556", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // number-person prefix
+ TYPEMAP.put("Q1552433", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // preverb
TYPEMAP.put("Q62155", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // affix
TYPEMAP.put("Q109249055", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // pseudo-affix
TYPEMAP.put("Q1153504", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // interfix
@@ -949,11 +981,12 @@ public String toString() {
TYPEMAP.put("Q18915698", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // established collocation
TYPEMAP.put("Q1428334", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // paradigm, an inflection table instead of actual words
TYPEMAP.put("Q102500", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // chemical symbol
- TYPEMAP.put("Q80071", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // symbol
+ TYPEMAP.put("Q80071", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // symbol
TYPEMAP.put("Q308229", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // currency sign
TYPEMAP.put("Q31963", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // emoticon
TYPEMAP.put("Q1668151", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // semantic punctuation mark
TYPEMAP.put("Q1984758", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // misspelling, not helpful
+ TYPEMAP.put("Q56161479", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // incorrect form, not helpful
// Types that are algorithmically added instead of stored.
TYPEMAP.put("Q69761768", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // feminine possessive
@@ -981,7 +1014,11 @@ public String toString() {
TYPEMAP.put("Q98772589", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // expanded contraction
TYPEMAP.put("Q1192464", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // rendaku in Japanese
TYPEMAP.put("Q126897884", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // denominal
+ TYPEMAP.put("Q58233068", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // humorous
+ TYPEMAP.put("Q43747", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // Internet slang
+ TYPEMAP.put("Q89522629", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // poetic form
TYPEMAP.put("Q213458", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // clitic
+ TYPEMAP.put("Q6548647", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // enclitic
TYPEMAP.put("Q340015", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // deixis, words requiring context to understand their meaning
TYPEMAP.put("Q162940", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // diacritic
@@ -1057,6 +1094,6 @@ public String toString() {
}
static Set extends Enum>> getMappedGrammemes(String grammeme) {
- return TYPEMAP.get(REMAP.getOrDefault(grammeme, grammeme));
+ return TYPEMAP.get(grammeme);
}
}
diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/GrammemeSetComparator.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/GrammemeSetComparator.java
new file mode 100644
index 00000000..5b1dbf07
--- /dev/null
+++ b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/GrammemeSetComparator.java
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2025 Unicode Incorporated and others. All rights reserved.
+ */
+package org.unicode.wikidata;
+
+import java.util.Comparator;
+import java.util.Set;
+
+/**
+ * Orders grammeme sets by size first, then element by element.
+ * <p>
+ * Sets of equal size are walked in iteration order and each pair of elements is compared with
+ * {@code Inflection.ENUM_COMPARATOR}; the first non-zero result decides the ordering.
+ * NOTE(review): this presumes both sets iterate in a consistent (e.g. sorted) order -- confirm against callers.
+ */
+public class GrammemeSetComparator implements Comparator<Set<Enum<?>>> {
+    /** Shared instance of this comparator. */
+    static final GrammemeSetComparator ENUM_COMPARATOR = new GrammemeSetComparator();
+
+    /**
+     * @return a negative, zero or positive value as {@code set1} sorts before, equal to or after {@code set2}
+     */
+    @Override
+    public int compare(Set<Enum<?>> set1, Set<Enum<?>> set2) {
+        // Smaller sets sort first. Integer.compare is the idiomatic, overflow-proof form of size1 - size2.
+        int sizeResult = Integer.compare(set1.size(), set2.size());
+        if (sizeResult != 0) {
+            return sizeResult;
+        }
+        // The sizes match, so set2's iterator has an element for every element of set1.
+        var set2Itr = set2.iterator();
+        for (var grammemeEnum : set1) {
+            int cmpResult = Inflection.ENUM_COMPARATOR.compare(grammemeEnum, set2Itr.next());
+            if (cmpResult != 0) {
+                return cmpResult;
+            }
+        }
+        return 0;
+    }
+}
diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java
index 94eb9d0a..ca6b62cd 100644
--- a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java
+++ b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java
@@ -5,20 +5,13 @@
package org.unicode.wikidata;
import java.io.BufferedInputStream;
-import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
+import java.io.IOException;
import java.nio.file.Paths;
-import java.text.NumberFormat;
-import java.util.EnumMap;
-import java.util.Locale;
import java.util.Properties;
-import java.util.regex.Pattern;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
@@ -26,16 +19,13 @@
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
-import com.ibm.icu.util.ULocale;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.lang3.StringUtils;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collections;
-import java.util.Comparator;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.List;
@@ -44,389 +34,10 @@
import java.util.TreeMap;
import java.util.TreeSet;
-import static org.unicode.wikidata.Grammar.TYPEMAP;
-import static org.unicode.wikidata.Grammar.REMAP;
import static org.unicode.wikidata.Grammar.Ignorable;
import static org.unicode.wikidata.Grammar.PartOfSpeech;
import static org.unicode.wikidata.Grammar.Sound;
-/**
- * Default parser option values.
- */
-final class ParserDefaults {
- static final String RESOURCES_DIR = "src/main/resources/org/unicode/wikidata/";
- static final String DEFAULT_INFLECTION_FILE_NAME = "inflectional.xml";
- static final String DEFAULT_DICTIONARY_FILE_NAME = "dictionary.lst";
- // Put the rare inflections at the end.
- static final Comparator RARITY_AWARE_COMPARATOR = Comparator
- .comparing(Inflection::isRareUsage)
- .thenComparing(Inflection::compareTo);
- private ParserDefaults() {}
-}
-
-/**
- * The options to extract the data from the data source.
- */
-final class ParserOptions {
- private static final char COLON_SEPARATOR = ':';
- static final String INFLECTIONS_FILE = "--inflections";
- static final String DICTIONARY_FILE = "--dictionary";
- static final String MAP_GRAMMEME = "--map-grammeme";
- static final String ADD_EXTRA_GRAMMEMES = "--add-extra-grammemes";
- static final String INFLECTION_TYPES = "--inflection-types";
- static final String IGNORE_GRAMMEMES_FOR_TYPES = "--ignore-grammemes-for-types";
- static final String IGNORE_PROPERTY = "--ignore-property";
- static final String INCLUDE_LEMMAS_WITHOUT_WORD = "--include-lemmas-without-words";
- static final String IGNORE_SURFACE_FORM = "--ignore-entries-with-grammemes";
- static final String IGNORE_UNANNOTATED_SURFACE_FORM = "--ignore-unannotated-entries";
- static final String ADD_NORMALIZED_ENTRY = "--add-normalized-entry";
- static final String LANGUAGE_OPT = "--language";
- static final String TIMESTAMP = "--timestamp";
- static final String ADD_DEFAULT_GRAMMEME_FOR_CATEGORY = "--add-default-grammeme-for-category";
- static final String IGNORE_UNSTRUCTURED_ENTRIES = "--ignore-unstructured-entries";
- static final String ADD_SOUND = "--add-sound";
-
- boolean includeLemmasWithoutWords = false;
- boolean ignoreUnannotated = false;
- boolean addNormalizedEntry = false;
- boolean ignoreUnstructuredEntries = false;
- boolean debug = false;
- final boolean addSound;
-
- EnumSet posToBeInflected;
- TreeSet posWithoutGrammemes;
- TreeMap> additionalGrammemesDict;
- TreeMap> defaultGrammemeForCategory;
- TreeMap> claimsToSound;
-
- ArrayList sourceFilenames;
- String inflectionalFilename = ParserDefaults.DEFAULT_INFLECTION_FILE_NAME;
- String lexicalDictionaryFilename = ParserDefaults.DEFAULT_DICTIONARY_FILE_NAME;
- ArrayList locales = new ArrayList<>(List.of(Locale.ENGLISH.getLanguage()));
- List optionsUsedToInvoke = new ArrayList<>();
-
- private static void printUsage() {
- System.err.println("Usage: ParseLexicon [OPTIONS] [ ...]");
- System.err.println("\nOPTIONS");
- System.err.println(INFLECTIONS_FILE + " \tthe file for the inflectional patterns to be generated, default: inflectional.xml");
- System.err.println(DICTIONARY_FILE + " \tthe file for the lexical dictionary to be generated, default: dictionary.lst");
- System.err.println(ADD_EXTRA_GRAMMEMES + " \tFile containing words with the extra grammemes to be added, provide path relative to tools/dictionary-parser/src/main/resources/org/unicode/wikidata/ (only to be used for a temporary grammeme addition)");
- System.err.println(INFLECTION_TYPES + " pos1[,pos2,...]\tthe pos's to be inflected, default: noun");
- System.err.println(IGNORE_GRAMMEMES_FOR_TYPES + " pos1[,pos2,...]\tthe part of speeches for which we don't want to include any grammeme info other than vowel/consonant start, default: (NONE)");
- System.err.println(MAP_GRAMMEME + " grammeme1,grammeme2\twhen grammeme1 is seen in the source dictionary, use grammeme2 instead of it");
- System.err.println(IGNORE_PROPERTY + " grammeme1[,grammeme2,...]\teach property is considered to be an ignorable property.");
- System.err.println(IGNORE_SURFACE_FORM + " type1[,type2,...]\tignore entries with specified grammemes. Default: do not ignore");
- System.err.println(IGNORE_UNANNOTATED_SURFACE_FORM + " \tignore entries without any grammeme annotation. Default: do not ignore");
- System.err.println(INCLUDE_LEMMAS_WITHOUT_WORD + "\tinclude lemma entries which do not have corresponding word-entry. Default: do not include");
- System.err.println(TIMESTAMP + "\ttimestamp of the latest lexicon used. Default: NONE");
- System.err.println(LANGUAGE_OPT + "\tComma separated list of languages to extract to the lexical dictionary. Default: " + ULocale.ENGLISH.getName());
- System.err.println(ADD_NORMALIZED_ENTRY + "\tAdds the normalized entry of a dictionary as an additional dictionary entry, only applies for non lowercase entries. Default: false");
- System.err.println(ADD_DEFAULT_GRAMMEME_FOR_CATEGORY + "\t[pos=partofSpeech1]category1=grammeme1[,category2=grammeme2.....]\t For each of the provided categories if no grammeme is present then add the default grammeme provided for that category to the word. Only applies for the provided parts of speech if pos= is supplied Default: (NONE)");
- System.err.println(IGNORE_UNSTRUCTURED_ENTRIES + " \tIgnore unstructured entries from the lexicon. Default: false");
- System.err.println(ADD_SOUND + " grammeme1[,grammeme2,...]\tSound properties to check for.");
- }
-
- ParserOptions(String[] args) throws Exception{
- posToBeInflected = EnumSet.of(PartOfSpeech.NOUN);
- posWithoutGrammemes = new TreeSet<>();
- additionalGrammemesDict = new TreeMap<>();
- sourceFilenames = new ArrayList<>();
- defaultGrammemeForCategory = new TreeMap<>();
- claimsToSound = new TreeMap<>();
-
- for (int i = 0; i < args.length; i++) {
- String arg = args[i];
- if (ParserOptions.INFLECTIONS_FILE.equals(arg)) {
- inflectionalFilename = args[++i];
- } else if (ParserOptions.DICTIONARY_FILE.equals(arg)) {
- lexicalDictionaryFilename = args[++i];
- } else if (ParserOptions.ADD_EXTRA_GRAMMEMES.equals(arg)) {
- String additionalGrammemeFilename = args[++i];
- String filePath = Paths.get(ParserDefaults.RESOURCES_DIR + additionalGrammemeFilename).toAbsolutePath().toString();
- try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8))) {
- String line;
- while ((line = br.readLine()) != null) {
- int colonIdx = line.indexOf(COLON_SEPARATOR);
- String phrase = line.substring(0, colonIdx);
- String grammemes = line.substring(colonIdx + 1).trim();
- additionalGrammemesDict.put(phrase, new TreeSet<>(Arrays.asList(grammemes.split(" "))));
- }
- optionsUsedToInvoke.add(ParserOptions.ADD_EXTRA_GRAMMEMES);
- optionsUsedToInvoke.add(additionalGrammemeFilename);
- }
- } else if (ParserOptions.MAP_GRAMMEME.equals(arg)) {
- String mapGrammeme = args[++i];
- String[] split = mapGrammeme.split(",", 2);
- REMAP.put(split[0], split[1]);
-
- optionsUsedToInvoke.add(ParserOptions.MAP_GRAMMEME);
- optionsUsedToInvoke.add(mapGrammeme);
- } else if (ParserOptions.IGNORE_PROPERTY.equals(arg)) {
- String propertySetToIgnore = args[++i];
- setIgnoreProperty(propertySetToIgnore.split(","), Ignorable.IGNORABLE_PROPERTY);
- optionsUsedToInvoke.add(ParserOptions.IGNORE_PROPERTY);
- optionsUsedToInvoke.add(propertySetToIgnore);
- } else if (ParserOptions.INFLECTION_TYPES.equals(arg)) {
- String inflectionTypes = args[++i];
- posToBeInflected.clear();
-
- for (String pos : inflectionTypes.split(",")) {
- posToBeInflected.add(PartOfSpeech.valueOf(pos.toUpperCase()));
- }
-
- optionsUsedToInvoke.add(ParserOptions.INFLECTION_TYPES);
- optionsUsedToInvoke.add(inflectionTypes);
- } else if (ParserOptions.IGNORE_GRAMMEMES_FOR_TYPES.equals(arg)) {
- String ignoredGrammemeTypes = args[++i];
-
- posWithoutGrammemes.clear();
- posWithoutGrammemes.addAll(Arrays.asList(ignoredGrammemeTypes.split(",")));
-
- optionsUsedToInvoke.add(ParserOptions.IGNORE_GRAMMEMES_FOR_TYPES);
- optionsUsedToInvoke.add(ignoredGrammemeTypes);
- } else if (ParserOptions.INCLUDE_LEMMAS_WITHOUT_WORD.equals(arg)) {
- includeLemmasWithoutWords = true;
- optionsUsedToInvoke.add(ParserOptions.INCLUDE_LEMMAS_WITHOUT_WORD);
- } else if (ParserOptions.IGNORE_SURFACE_FORM.equals(arg)) {
- String ignoreEntriesWithGrammemesStr = args[++i];
- setIgnoreProperty(ignoreEntriesWithGrammemesStr.split(","), Ignorable.IGNORABLE_INFLECTION);
- optionsUsedToInvoke.add(ParserOptions.IGNORE_SURFACE_FORM);
- optionsUsedToInvoke.add(ignoreEntriesWithGrammemesStr);
- } else if (ParserOptions.IGNORE_UNANNOTATED_SURFACE_FORM.equals(arg)) {
- ignoreUnannotated = true;
- optionsUsedToInvoke.add(ParserOptions.IGNORE_UNANNOTATED_SURFACE_FORM);
- } else if (ParserOptions.TIMESTAMP.equals(arg)) {
- String timestamp = args[++i];
- optionsUsedToInvoke.add(ParserOptions.TIMESTAMP);
- optionsUsedToInvoke.add(timestamp);
- } else if (ParserOptions.LANGUAGE_OPT.equals(arg)) {
- String localeStr = args[++i];
- locales.clear();
- locales.addAll(List.of(localeStr.split(",")));
- optionsUsedToInvoke.add(ParserOptions.LANGUAGE_OPT);
- optionsUsedToInvoke.add(localeStr);
- } else if (ParserOptions.ADD_NORMALIZED_ENTRY.equals(arg)) {
- addNormalizedEntry = true;
- optionsUsedToInvoke.add(ParserOptions.ADD_NORMALIZED_ENTRY);
- } else if (ParserOptions.ADD_DEFAULT_GRAMMEME_FOR_CATEGORY.equals(arg)) {
- String categoryDefaultGrammemeString = args[++i];
- String[] tokens = categoryDefaultGrammemeString.split(",");
- String posValue = "";
- for (int idx = 0; idx < tokens.length; idx += 1) {
- String token = tokens[idx];
- String[] tokenArgs = token.split("=");
- if (tokenArgs.length != 2) {
- throw new IllegalArgumentException("Default Grammeme for category string does not have entry in the format a=b " + token);
- }
- String key = tokenArgs[0].toLowerCase();
- String value = tokenArgs[1].toLowerCase();
- if (key.compareTo("pos") == 0) {
- if (idx != 0) {
- throw new IllegalArgumentException("pos key is not the first argument for default Grammeme for category string " + categoryDefaultGrammemeString);
- }
- posValue = value;
- continue;
- }
- defaultGrammemeForCategory.putIfAbsent(posValue, new TreeMap<>());
- defaultGrammemeForCategory.get(posValue).put(key, value);
- }
-
- optionsUsedToInvoke.add(ParserOptions.ADD_DEFAULT_GRAMMEME_FOR_CATEGORY);
- optionsUsedToInvoke.add(categoryDefaultGrammemeString);
- } else if (ParserOptions.IGNORE_UNSTRUCTURED_ENTRIES.equals(arg)) {
- ignoreUnstructuredEntries = true;
- optionsUsedToInvoke.add(ParserOptions.IGNORE_UNSTRUCTURED_ENTRIES);
- } else if (ParserOptions.ADD_SOUND.equals(arg)) {
- String soundGrammemeTypes = args[++i];
-
- List additionalSoundProperties = Arrays.asList(soundGrammemeTypes.split(","));
-
- for (String claimID : ParseWikidata.PROPERTIES_WITH_PRONUNCIATION) {
- Properties soundRegexes = new Properties();
- String filePath = Paths.get(ParserDefaults.RESOURCES_DIR + claimID + ".properties").toAbsolutePath().toString();
- try (var propertiesStream = new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8)) {
- soundRegexes.load(propertiesStream);
- var enumMap = new EnumMap(Sound.class);
- for (var entry : soundRegexes.entrySet()) {
- var key = (String) entry.getKey();
- if (additionalSoundProperties.contains(key)) {
- enumMap.put(Sound.valueOf(key.toUpperCase(Locale.ROOT).replace('-', '_')), Pattern.compile((String)entry.getValue()));
- }
- }
- if (enumMap.size() != additionalSoundProperties.size()) {
- throw new IllegalArgumentException("Not all sound properties were found");
- }
- claimsToSound.put(claimID, enumMap);
- }
- }
-
- optionsUsedToInvoke.add(ParserOptions.ADD_SOUND);
- optionsUsedToInvoke.add(soundGrammemeTypes);
- } else {
- sourceFilenames.add(arg);
- }
- }
-
- addSound = !claimsToSound.isEmpty();
-
- if (sourceFilenames.isEmpty()) {
- printUsage();
- throw new IllegalArgumentException();
- }
- }
-
- void setIgnoreProperty(String[] grammemes, Ignorable ignorable) {
- var ignorableSet = EnumSet.of(ignorable);
- for (String grammeme : grammemes) {
- if (grammeme.matches("Q\\d*")) {
- TYPEMAP.put(grammeme, ignorableSet);
- }
- else {
- for (Map.Entry>> entry : TYPEMAP.entrySet()) {
- for (var grammemeEnum : entry.getValue()) {
- String name = grammemeEnum.name();
- if (name.equalsIgnoreCase(grammeme)) {
- if (entry.getValue().size() == 1) {
- entry.setValue(ignorableSet);
- }
- else {
- entry.getValue().remove(grammemeEnum);
- ArrayList> clone = new ArrayList<>(entry.getValue());
- clone.add(ignorable);
- entry.setValue(new HashSet<>(clone));
- }
- break;
- }
- }
- }
- }
- }
- }
-}
-
-/**
- * Contains statistical information on what has been analyzed.
- */
-final class DocumentState {
- int lemmaCount = 0;
- int unusableLemmaCount = 0;
- int unusableSurfaceFormCount = 0;
- int mergedCount = 0;
- int incomingSurfaceForm = 0;
- TreeMap dictionary = new TreeMap<>();
- ArrayList inflectionPatterns = new ArrayList<>(1024);
-
- boolean isInflectional() {
- return inflectionPatterns.size() > 1 || (inflectionPatterns.size() == 1 && inflectionPatterns.get(0).getCount() > 1);
- }
-
- private void sortInflectionPatterns(ArrayList inflectionPatterns) {
- // We are sorting for the common ones first and then compare the identifier for lack of a better
- inflectionPatterns.sort(Comparator
- .comparing(InflectionPattern::getCount)
- .reversed()
- .thenComparing(InflectionPattern::getID));
- int identifierEnumeration = 1;
- for (InflectionPattern inflectionPattern : inflectionPatterns) {
- inflectionPattern.setID(identifierEnumeration++); // This is where we are reassigning identifiers to their new values.
- }
- }
-
- public void addDictionaryEntry(DictionaryEntry dictionaryEntry){
- String phrase = dictionaryEntry.phrase;
- DictionaryEntry existingDictionaryEntry = dictionary.get(phrase);
- if (existingDictionaryEntry == null) {
- dictionary.put(phrase, dictionaryEntry);
- }else{
- mergedCount++;
- existingDictionaryEntry.merge(dictionaryEntry);
- }
- }
-
- public void printDocument(ParserOptions parserOptions, long startTime) throws FileNotFoundException {
- TreeMap, Integer> grammemeCounts = new TreeMap<>(EnumComparator.ENUM_COMPARATOR);
- int unclassifiedTerms = 0;
- if (isInflectional()) {
- try (PrintWriter inflectionalStream = new PrintWriter(new OutputStreamWriter(
- new FileOutputStream(parserOptions.inflectionalFilename), StandardCharsets.UTF_8))) {
- inflectionalStream.println("\n" +
- "");
-
- sortInflectionPatterns(inflectionPatterns);
- for (InflectionPattern inflectionPattern : inflectionPatterns) {
- inflectionalStream.print(inflectionPattern);
- }
- inflectionalStream.println("");
- }
- }
- try (PrintWriter lexicalDictionaryStream = new PrintWriter(new OutputStreamWriter(
- new FileOutputStream(parserOptions.lexicalDictionaryFilename), StandardCharsets.UTF_8))) {
- for (Map.Entry entry : dictionary.entrySet()) {
- DictionaryEntry dictionaryEntry = entry.getValue();
- if (dictionaryEntry.getGrammemes().isEmpty()) {
- // We don't care about only known words. We need grammeme data
- unclassifiedTerms++;
- continue;
- }
- // Print the dictionary entry to the .lst file.
- lexicalDictionaryStream.println(dictionaryEntry.toString(isInflectional()));
- for (Enum> grammeme : dictionaryEntry.getGrammemes()) {
- grammemeCounts.merge(grammeme, 1, Integer::sum);
- }
- }
-
- NumberFormat percentFormat = NumberFormat.getPercentInstance(Locale.US);
- percentFormat.setMaximumFractionDigits(1);
- int dictionarySize = dictionary.size();
- StringBuilder source = new StringBuilder();
- Pattern anythingSlash = Pattern.compile(".*/");
- for (String sourceFilename : parserOptions.sourceFilenames) {
- source.append(anythingSlash.matcher(sourceFilename).replaceAll("")).append(" ");
- }
- lexicalDictionaryStream.println("==============================================");
- lexicalDictionaryStream.printf("%30s %7s%n", "Source:", source);
- lexicalDictionaryStream.printf("%30s %7d%n", "Lemma terms:", lemmaCount);
- lexicalDictionaryStream.printf("%30s %7d%n", "Unusable lemma terms:", unusableLemmaCount);
- lexicalDictionaryStream.printf("%30s %7d%n", "Incoming surface forms:", incomingSurfaceForm);
- lexicalDictionaryStream.printf("%30s %7d%n", "Surface forms:", dictionarySize);
- lexicalDictionaryStream.printf("%30s %7d %7s%n", "Collapsed surface forms:", mergedCount, '(' + percentFormat.format((mergedCount) / (double) incomingSurfaceForm) + ')');
- lexicalDictionaryStream.printf("%30s %7d%n", "Unusable surface forms:", unusableSurfaceFormCount);
- lexicalDictionaryStream.printf("%30s %7d %7s%n", "Usable terms:", dictionarySize - unclassifiedTerms, '(' + percentFormat.format((dictionarySize - unclassifiedTerms) / (double) dictionarySize) + ')');
- lexicalDictionaryStream.printf("%30s %7d %7s%n", "Unclassified terms:", unclassifiedTerms, '(' + percentFormat.format(unclassifiedTerms / (double) dictionarySize) + ')');
- lexicalDictionaryStream.println("==============================================");
- TreeMap>> categories = new TreeMap<>();
- for (var entry : grammemeCounts.entrySet()) {
- var entryCategory = entry.getKey().getClass().getSimpleName();
- if (!categories.containsKey(entryCategory)) {
- categories.put(entryCategory, new ArrayList<>());
- }
- var categoryValues = categories.get(entryCategory);
- categoryValues.add(entry.getKey());
- }
-
- for (var categoryEntry : categories.entrySet()) {
- var categoryName = categoryEntry.getKey();
- lexicalDictionaryStream.printf("%s:%n", categoryName);
- var categoryValues = categoryEntry.getValue();
- categoryValues.sort(Comparator.comparing(grammemeCounts::get));
- Collections.reverse(categoryValues);
- for (var categoryValue : categoryValues) {
- lexicalDictionaryStream.printf(" %-20s %7d %7s%n", categoryValue.toString() + ':', grammemeCounts.get(categoryValue), '(' + percentFormat.format(grammemeCounts.get(categoryValue) / (double) dictionarySize) + ')');
- }
- lexicalDictionaryStream.printf("%n");
- }
- long endTime = System.currentTimeMillis();
- long elapsedTime = (endTime-startTime);
- lexicalDictionaryStream.println("processed in " + (elapsedTime / 1000) + '.' + (elapsedTime % 1000) + " seconds");
- lexicalDictionaryStream.println("License: Creative Commons CC0 License (https://creativecommons.org/publicdomain/zero/1.0/)");
- lexicalDictionaryStream.println("generated with options: " + String.join(" ", parserOptions.optionsUsedToInvoke));
- }
- }
-
- DocumentState() {
- }
-}
-
/**
* @see https://dumps.wikimedia.org/wikidatawiki/entities/
*/
@@ -436,6 +47,7 @@ public final class ParseWikidata {
));
static final Set PROPERTIES_WITH_GRAMMEMES = new TreeSet<>(List.of(
"P31", // instance of. Sometimes phrase information is here.
+ "P1552", // has characteristic for animacy
"P5185" // grammatical gender
));
static final Set IMPORTANT_PROPERTIES = new TreeSet<>(PROPERTIES_WITH_GRAMMEMES);
@@ -462,22 +74,30 @@ private Lemma() {}
}
private final ParserOptions parserOptions;
- private final DocumentState documentState;
+ private final DocumentState documentState = new DocumentState();
+ private final TreeSet rareLemmas = new TreeSet<>();
+ private final TreeSet omitLemmas = new TreeSet<>();
ParseWikidata(ParserOptions parserOptions)
{
this.parserOptions = parserOptions;
- this.documentState = new DocumentState();
- }
-
- private void addGrammeme(TreeSet> grammemes, @Nullable String grammeme) {
- if (grammeme != null && !grammeme.isEmpty()) {
- Set extends Enum>> values = Grammar.getMappedGrammemes(grammeme);
- if (values == null) {
- throw new RuntimeException(grammeme + " is not a known grammeme");
+ for (var language : parserOptions.locales) {
+ Properties rareLemmasProperties = new Properties();
+ String filePath = Paths.get(ParserDefaults.RESOURCES_DIR + "filter_" + language + ".properties").toAbsolutePath().toString();
+ try (var propertiesStream = new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8)) {
+ rareLemmasProperties.load(propertiesStream);
+ for (var entry : rareLemmasProperties.entrySet()) {
+ String key = entry.getKey().toString();
+ String value = entry.getValue().toString();
+ switch (value) {
+ case "rare" : rareLemmas.add(key); break;
+ case "omit" : omitLemmas.add(key); break;
+ default: throw new IllegalArgumentException(key + ": Unknown key value " + value);
+ }
+ }
}
- else if (!values.contains(Ignorable.IGNORABLE_PROPERTY)) {
- grammemes.addAll(values);
+ catch (IOException e) {
+ // The per-language filter file is optional, so a missing or unreadable file just means nothing is marked rare or omitted.
}
}
}
@@ -485,6 +105,10 @@ else if (!values.contains(Ignorable.IGNORABLE_PROPERTY)) {
static final String VARIANT_SEPARATOR = "-x-";
private void analyzeLexeme(int lineNumber, Lexeme lexeme) {
+ if (omitLemmas.contains(lexeme.id)) {
+ // This lexeme is marked "omit" in a filter_<language>.properties file, so skip it entirely.
+ return;
+ }
Lemma lemma = new Lemma();
Set extends Enum>> partOfSpeechSet = null;
for (var lemmaEntry : lexeme.lemmas.entrySet()) {
@@ -514,11 +138,15 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) {
}
lemma.grammemes.addAll(variant);
}
+ if (rareLemmas.contains(lexeme.id)) {
+ lemma.grammemes.add(Grammar.Usage.RARE);
+ }
extractImportantProperties(lexeme.claims, lemma.grammemes, lexeme.id, lemma.value);
if (lemma.grammemes.contains(Ignorable.IGNORABLE_LEMMA) || lemma.grammemes.contains(Ignorable.IGNORABLE_INFLECTION)) {
documentState.unusableLemmaCount++;
continue;
}
+ lemma.grammemes.remove(Ignorable.IGNORABLE_PROPERTY);
for (var form : lexeme.forms) {
Inflection currentInflection = null;
var representation = form.representations.get(currentLemmaLanguage);
@@ -553,18 +181,32 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) {
currentInflection.grammemeSet.remove(Grammar.Usage.RARE);
}
currentInflection.grammemeSet.remove(Ignorable.IGNORABLE_PROPERTY);
- lemma.inflections.add(currentInflection);
+ var grammemeExpansion = parserOptions.expandGramemes != null ? parserOptions.expandGramemes.get(currentInflection.grammemeSet) : null;
if (parserOptions.addSound && form.claims != null && !form.claims.isEmpty() && currentInflection.inflection.charAt(0) == lemma.value.charAt(0)) {
// We have potential data, and the words aren't mixed together. So this is probably accurate.
addSound(form.claims, currentInflection.grammemeSet, lexeme.id, lemma.value);
}
+ if (grammemeExpansion == null) {
+ lemma.inflections.add(currentInflection);
+ }
+ else {
+ for (var grammemeSet : grammemeExpansion) {
+ var expandedInflection = new Inflection(currentInflection.inflection, currentInflection.rareUsage);
+ expandedInflection.grammemeSet.addAll(currentInflection.grammemeSet);
+ expandedInflection.grammemeSet.addAll(grammemeSet);
+ lemma.inflections.add(expandedInflection);
+ }
+ }
}
documentState.incomingSurfaceForm += lemma.inflections.size();
lemma.isRare = lemma.grammemes.contains(Grammar.Usage.RARE);
if (lemma.isRare) {
lemma.grammemes.remove(Grammar.Usage.RARE);
}
- lemma.grammemes.remove(Ignorable.IGNORABLE_PROPERTY);
+ if (lemma.inflections.isEmpty()) {
+ documentState.unusableLemmaCount++;
+ return;
+ }
analyzeLemma(lemma);
}
}
@@ -761,18 +403,11 @@ private void analyzeInflections(Lemma lemma, List inputInflections)
}
// else ignore this unimportant inflection pattern. This is usually trimmed for size.
}
- Locale currLocale = Locale.forLanguageTag(parserOptions.locales.get(0));
for (int i = 0; i < inflections.size() ; i++) {
var inflection = inflections.get(i);
String phrase = inflection.getInflection();
InflectionPattern inflectionPatternForDict = nonEmptyInflectionIndices.contains(i) ? inflectionPattern : null;
documentState.addDictionaryEntry(new DictionaryEntry(phrase, phrase, lemma.isRare, inflection.getGrammemeSet(), inflectionPatternForDict));
- if (parserOptions.addNormalizedEntry) {
- String normalizedPhrase = phrase.toLowerCase(currLocale); // locale is specified in the options, by default we use en_US
- if (!normalizedPhrase.equals(phrase) && !lemma.isRare) {
- documentState.addDictionaryEntry(new DictionaryEntry(normalizedPhrase, phrase, false, inflection.getGrammemeSet(), inflectionPatternForDict));
- }
- }
}
}
@@ -812,6 +447,18 @@ private List enumerateInflectionsForGrammemeCombinations(Inflection
return resultInflections;
}
+ private void addGrammeme(TreeSet> grammemes, @Nullable String grammeme) { // Looks grammeme up in Grammar.DEFAULTMAP and adds the result to grammemes; a null or empty grammeme is a no-op.
+ if (grammeme != null && !grammeme.isEmpty()) { // Nothing to add when no grammeme name was supplied.
+ Enum> value = Grammar.DEFAULTMAP.get(grammeme);
+ if (value == null) { // The name has no entry in Grammar.DEFAULTMAP.
+ throw new NullPointerException(grammeme + " is not a known grammeme"); // Fail fast on an unrecognized grammeme name.
+ }
+ else if (!value.equals(Ignorable.IGNORABLE_PROPERTY)) { // Ignorable.IGNORABLE_PROPERTY is recognized but deliberately never added to the set.
+ grammemes.add(value);
+ }
+ }
+ }
+
private void mergeAdditionalGrammemes() {
// Add any entries that are missing. The actual properties will be added elsewhere.
TreeSet> grammemes = new TreeSet<>(EnumComparator.ENUM_COMPARATOR);
diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParserDefaults.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParserDefaults.java
new file mode 100644
index 00000000..1d43e76f
--- /dev/null
+++ b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParserDefaults.java
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2025 Unicode Incorporated and others. All rights reserved.
+ * Copyright 2020-2024 Apple Inc. All rights reserved.
+ */
+package org.unicode.wikidata;
+
+import java.util.Comparator;
+
+/**
+ * Default parser option values: the resource directory prefix, the default output file names, and the default inflection ordering.
+ */
+final class ParserDefaults { // Constant holder only; the private constructor below prevents instantiation.
+ static final String RESOURCES_DIR = "src/main/resources/org/unicode/wikidata/"; // Relative path prefix that ParserOptions prepends to resource file names.
+ static final String DEFAULT_INFLECTION_FILE_NAME = "inflectional.xml"; // Initial value of ParserOptions.inflectionalFilename.
+ static final String DEFAULT_DICTIONARY_FILE_NAME = "dictionary.lst"; // Initial value of ParserOptions.lexicalDictionaryFilename.
+ // Put the rare inflections at the end.
+ static final Comparator RARITY_AWARE_COMPARATOR = Comparator // Primary key is isRareUsage (presumably a boolean, so false orders before true - confirm); ties fall back to Inflection::compareTo.
+ .comparing(Inflection::isRareUsage)
+ .thenComparing(Inflection::compareTo);
+
+ private ParserDefaults() { // Not instantiable.
+ }
+}
diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParserOptions.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParserOptions.java
new file mode 100644
index 00000000..3f557c23
--- /dev/null
+++ b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParserOptions.java
@@ -0,0 +1,267 @@
+/*
+ * Copyright 2025 Unicode Incorporated and others. All rights reserved.
+ * Copyright 2020-2024 Apple Inc. All rights reserved.
+ */
+package org.unicode.wikidata;
+
+import com.ibm.icu.util.ULocale;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.EnumMap;
+import java.util.EnumSet;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.regex.Pattern;
+
+import static org.unicode.wikidata.Grammar.DEFAULTMAP;
+import static org.unicode.wikidata.Grammar.TYPEMAP;
+
+/**
+ * The options to extract the data from the data source, parsed from the command-line argument array passed to the constructor.
+ */
+final class ParserOptions {
+ private static final char COLON_SEPARATOR = ':'; // Separates the phrase from its grammemes in the --add-extra-grammemes file.
+ static final String INFLECTIONS_FILE = "--inflections";
+ static final String DICTIONARY_FILE = "--dictionary";
+ static final String MAP_GRAMMEME = "--map-grammeme";
+ static final String ADD_EXTRA_GRAMMEMES = "--add-extra-grammemes";
+ static final String EXPAND_GRAMMEMES = "--expand-grammemes";
+ static final String INFLECTION_TYPES = "--inflection-types";
+ static final String IGNORE_PROPERTY = "--ignore-property";
+ static final String INCLUDE_LEMMAS_WITHOUT_WORD = "--include-lemmas-without-words";
+ static final String IGNORE_SURFACE_FORM = "--ignore-entries-with-grammemes";
+ static final String LANGUAGE_OPT = "--language";
+ static final String TIMESTAMP = "--timestamp";
+ static final String ADD_DEFAULT_GRAMMEME_FOR_CATEGORY = "--add-default-grammeme-for-category";
+ static final String ADD_SOUND = "--add-sound";
+
+ boolean includeLemmasWithoutWords = false; // Set to true by --include-lemmas-without-words.
+ boolean debug = false; // Not set by any option parsed in this class.
+ final boolean addSound; // True when --add-sound loaded sound patterns for at least one claim ID.
+
+ EnumSet posToBeInflected; // Parts of speech to inflect; starts as NOUN and is replaced by --inflection-types.
+ TreeMap>, List>>> expandGramemes; // Stays null unless --expand-grammemes is given at least once.
+ TreeMap> additionalGrammemesDict; // Phrase -> grammemes read from the --add-extra-grammemes file.
+ TreeMap> defaultGrammemeForCategory; // Part of speech ("" when no pos= token was given) -> category -> default grammeme.
+ TreeMap> claimsToSound; // Claim ID -> compiled sound patterns loaded for --add-sound.
+
+ ArrayList sourceFilenames; // Every argument that is not a recognized option.
+ String inflectionalFilename = ParserDefaults.DEFAULT_INFLECTION_FILE_NAME;
+ String lexicalDictionaryFilename = ParserDefaults.DEFAULT_DICTIONARY_FILE_NAME;
+ ArrayList locales = new ArrayList<>(List.of(Locale.ENGLISH.getLanguage())); // Defaults to English; replaced by --language.
+ List optionsUsedToInvoke = new ArrayList<>(); // Recognized options and their values in argument order; the two output file options are not recorded.
+
+ private static void printUsage() { // Prints the command-line help to standard error.
+ System.err.println("Usage: ParseLexicon [OPTIONS] [ ...]");
+ System.err.println("\nOPTIONS");
+ System.err.println(INFLECTIONS_FILE + " \tthe file for the inflectional patterns to be generated, default: inflectional.xml");
+ System.err.println(DICTIONARY_FILE + " \tthe file for the lexical dictionary to be generated, default: dictionary.lst");
+ System.err.println(ADD_EXTRA_GRAMMEMES + " \tFile containing words with the extra grammemes to be added, provide path relative to tools/dictionary-parser/src/main/resources/org/unicode/wikidata/ (only to be used for a temporary grammeme addition)");
+ System.err.println(EXPAND_GRAMMEMES + " grammeme1,grammeme2...:grammeme3,grammeme4...\tWhen the first set of grammemes are matched, add the additional set of grammemes.");
+ System.err.println(INFLECTION_TYPES + " pos1[,pos2,...]\tthe pos's to be inflected, default: noun");
+ System.err.println(MAP_GRAMMEME + " grammeme1,grammeme2\twhen grammeme1 is seen in the source dictionary, use grammeme2 instead of it");
+ System.err.println(IGNORE_PROPERTY + " grammeme1[,grammeme2,...]\teach property is considered to be an ignorable property.");
+ System.err.println(IGNORE_SURFACE_FORM + " type1[,type2,...]\tignore entries with specified grammemes. Default: do not ignore");
+ System.err.println(INCLUDE_LEMMAS_WITHOUT_WORD + "\tinclude lemma entries which do not have corresponding word-entry. Default: do not include");
+ System.err.println(TIMESTAMP + "\ttimestamp of the latest lexicon used. Default: NONE");
+ System.err.println(LANGUAGE_OPT + "\tComma separated list of languages to extract to the lexical dictionary. Default: " + ULocale.ENGLISH.getName());
+ System.err.println(ADD_DEFAULT_GRAMMEME_FOR_CATEGORY + "\t[pos=partofSpeech1]category1=grammeme1[,category2=grammeme2.....]\t For each of the provided categories if no grammeme is present then add the default grammeme provided for that category to the word. Only applies for the provided parts of speech if pos= is supplied Default: (NONE)");
+ System.err.println(ADD_SOUND + " grammeme1[,grammeme2,...]\tSound properties to check for.");
+ }
+
+ ParserOptions(String[] args) throws IOException { // Parses args; prints the usage and throws IllegalArgumentException when no source file name is given.
+ posToBeInflected = EnumSet.of(Grammar.PartOfSpeech.NOUN);
+ additionalGrammemesDict = new TreeMap<>();
+ sourceFilenames = new ArrayList<>();
+ defaultGrammemeForCategory = new TreeMap<>();
+ claimsToSound = new TreeMap<>();
+
+ for (int i = 0; i < args.length; i++) {
+ String arg = args[i];
+ if (ParserOptions.INFLECTIONS_FILE.equals(arg)) {
+ inflectionalFilename = args[++i]; // An option's value is the next argument. NOTE(review): a missing value throws ArrayIndexOutOfBoundsException, here and at every other args[++i].
+ } else if (ParserOptions.DICTIONARY_FILE.equals(arg)) {
+ lexicalDictionaryFilename = args[++i];
+ } else if (ParserOptions.ADD_EXTRA_GRAMMEMES.equals(arg)) {
+ String additionalGrammemeFilename = args[++i];
+ String filePath = Paths.get(ParserDefaults.RESOURCES_DIR + additionalGrammemeFilename).toAbsolutePath().toString(); // The file name is looked up under ParserDefaults.RESOURCES_DIR.
+ try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8))) {
+ String line;
+ while ((line = br.readLine()) != null) { // Each line has the form phrase:grammeme1 grammeme2 ...
+ int colonIdx = line.indexOf(COLON_SEPARATOR);
+ String phrase = line.substring(0, colonIdx); // NOTE(review): a line without ':' gives colonIdx == -1, and substring then throws StringIndexOutOfBoundsException.
+ String grammemes = line.substring(colonIdx + 1).trim();
+ additionalGrammemesDict.put(phrase, new TreeSet<>(Arrays.asList(grammemes.split(" ")))); // Grammemes are separated by single spaces; a repeated phrase overwrites the earlier entry.
+ }
+ optionsUsedToInvoke.add(ParserOptions.ADD_EXTRA_GRAMMEMES);
+ optionsUsedToInvoke.add(additionalGrammemeFilename);
+ }
+ } else if (ParserOptions.MAP_GRAMMEME.equals(arg)) {
+ String mapGrammeme = args[++i];
+ String[] split = mapGrammeme.split(",", 2); // Limit 2: only the first comma separates the source grammeme from its replacement(s).
+ TYPEMAP.put(split[0], toEnumSet(split[1])); // NOTE(review): split[1] throws ArrayIndexOutOfBoundsException when the value contains no comma.
+
+ optionsUsedToInvoke.add(ParserOptions.MAP_GRAMMEME);
+ optionsUsedToInvoke.add(mapGrammeme);
+ } else if (ParserOptions.IGNORE_PROPERTY.equals(arg)) {
+ String propertySetToIgnore = args[++i];
+ setIgnoreProperty(propertySetToIgnore.split(","), Grammar.Ignorable.IGNORABLE_PROPERTY);
+ optionsUsedToInvoke.add(ParserOptions.IGNORE_PROPERTY);
+ optionsUsedToInvoke.add(propertySetToIgnore);
+ } else if (ParserOptions.EXPAND_GRAMMEMES.equals(arg)) {
+ String mapGrammemes = args[++i];
+ String[] split = mapGrammemes.split(":", 2); // Format is matchedGrammemes:addedGrammemes. NOTE(review): split[1] below throws when there is no ':'.
+ var key = toEnumSet(split[0]);
+ var valueArray = new ArrayList<>(List.of(toEnumSet(split[1])));
+ if (expandGramemes == null) { // The map is created lazily on the first use of this option.
+ expandGramemes = new TreeMap<>(GrammemeSetComparator.ENUM_COMPARATOR);
+ }
+ expandGramemes.merge(key, valueArray, (oldList, newList) -> { // Repeating the same key appends another expansion set instead of replacing the earlier one.
+ oldList.addAll(newList);
+ return oldList;
+ });
+ optionsUsedToInvoke.add(ParserOptions.EXPAND_GRAMMEMES);
+ optionsUsedToInvoke.add(mapGrammemes);
+ } else if (ParserOptions.INFLECTION_TYPES.equals(arg)) {
+ String inflectionTypes = args[++i];
+ posToBeInflected.clear(); // The default NOUN is dropped once this option is supplied.
+
+ for (String pos : inflectionTypes.split(",")) {
+ posToBeInflected.add(Grammar.PartOfSpeech.valueOf(pos.toUpperCase())); // valueOf rejects unknown names with IllegalArgumentException. NOTE(review): toUpperCase() uses the default locale.
+ }
+
+ optionsUsedToInvoke.add(ParserOptions.INFLECTION_TYPES);
+ optionsUsedToInvoke.add(inflectionTypes);
+ } else if (ParserOptions.INCLUDE_LEMMAS_WITHOUT_WORD.equals(arg)) {
+ includeLemmasWithoutWords = true;
+ optionsUsedToInvoke.add(ParserOptions.INCLUDE_LEMMAS_WITHOUT_WORD);
+ } else if (ParserOptions.IGNORE_SURFACE_FORM.equals(arg)) {
+ String ignoreEntriesWithGrammemesStr = args[++i];
+ setIgnoreProperty(ignoreEntriesWithGrammemesStr.split(","), Grammar.Ignorable.IGNORABLE_INFLECTION);
+ optionsUsedToInvoke.add(ParserOptions.IGNORE_SURFACE_FORM);
+ optionsUsedToInvoke.add(ignoreEntriesWithGrammemesStr);
+ } else if (ParserOptions.TIMESTAMP.equals(arg)) {
+ String timestamp = args[++i]; // Only recorded in optionsUsedToInvoke; nothing else in this constructor reads it.
+ optionsUsedToInvoke.add(ParserOptions.TIMESTAMP);
+ optionsUsedToInvoke.add(timestamp);
+ } else if (ParserOptions.LANGUAGE_OPT.equals(arg)) {
+ String localeStr = args[++i];
+ locales.clear(); // Replaces the default English entry rather than adding to it.
+ locales.addAll(List.of(localeStr.split(",")));
+ optionsUsedToInvoke.add(ParserOptions.LANGUAGE_OPT);
+ optionsUsedToInvoke.add(localeStr);
+ } else if (ParserOptions.ADD_DEFAULT_GRAMMEME_FOR_CATEGORY.equals(arg)) {
+ String categoryDefaultGrammemeString = args[++i];
+ String[] tokens = categoryDefaultGrammemeString.split(",");
+ String posValue = ""; // Stays "" when no pos= token is given.
+ for (int idx = 0; idx < tokens.length; idx += 1) {
+ String token = tokens[idx];
+ String[] tokenArgs = token.split("=");
+ if (tokenArgs.length != 2) { // Each token must be exactly key=value.
+ throw new IllegalArgumentException("Default Grammeme for category string does not have entry in the format a=b " + token);
+ }
+ String key = tokenArgs[0].toLowerCase(); // NOTE(review): toLowerCase() uses the default locale, here and on the next line.
+ String value = tokenArgs[1].toLowerCase();
+ if (key.compareTo("pos") == 0) { // Same test as key.equals("pos").
+ if (idx != 0) { // pos= is only accepted as the first token.
+ throw new IllegalArgumentException("pos key is not the first argument for default Grammeme for category string " + categoryDefaultGrammemeString);
+ }
+ posValue = value;
+ continue;
+ }
+ defaultGrammemeForCategory.putIfAbsent(posValue, new TreeMap<>());
+ defaultGrammemeForCategory.get(posValue).put(key, value);
+ }
+
+ optionsUsedToInvoke.add(ParserOptions.ADD_DEFAULT_GRAMMEME_FOR_CATEGORY);
+ optionsUsedToInvoke.add(categoryDefaultGrammemeString);
+ } else if (ParserOptions.ADD_SOUND.equals(arg)) {
+ String soundGrammemeTypes = args[++i];
+
+ List additionalSoundProperties = Arrays.asList(soundGrammemeTypes.split(","));
+
+ for (String claimID : ParseWikidata.PROPERTIES_WITH_PRONUNCIATION) { // One claimID.properties file under RESOURCES_DIR is loaded per listed claim ID.
+ Properties soundRegexes = new Properties();
+ String filePath = Paths.get(ParserDefaults.RESOURCES_DIR + claimID + ".properties").toAbsolutePath().toString();
+ try (var propertiesStream = new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8)) {
+ soundRegexes.load(propertiesStream);
+ var enumMap = new EnumMap(Grammar.Sound.class);
+ for (var entry : soundRegexes.entrySet()) {
+ var key = (String) entry.getKey();
+ if (additionalSoundProperties.contains(key)) { // Only the requested sound properties are compiled.
+ enumMap.put(Grammar.Sound.valueOf(key.toUpperCase(Locale.ROOT).replace('-', '_')), Pattern.compile((String) entry.getValue())); // Property key -> Grammar.Sound constant: upper-cased with '-' replaced by '_'.
+ }
+ }
+ if (enumMap.size() != additionalSoundProperties.size()) { // Every requested property must be found in each claim's file. NOTE(review): a name repeated in the option value also fails this check.
+ throw new IllegalArgumentException("Not all sound properties were found");
+ }
+ claimsToSound.put(claimID, enumMap);
+ }
+ }
+
+ optionsUsedToInvoke.add(ParserOptions.ADD_SOUND);
+ optionsUsedToInvoke.add(soundGrammemeTypes);
+ } else {
+ sourceFilenames.add(arg); // Anything that is not a recognized option is treated as a source file name.
+ }
+ }
+
+ addSound = !claimsToSound.isEmpty();
+
+ if (sourceFilenames.isEmpty()) { // At least one source file is required.
+ printUsage();
+ throw new IllegalArgumentException();
+ }
+ }
+
+ void setIgnoreProperty(String[] grammemes, Grammar.Ignorable ignorable) { // Remaps each listed grammeme in the shared TYPEMAP to the given ignorable marker.
+ var ignorableSet = EnumSet.of(ignorable);
+ for (String grammeme : grammemes) {
+ if (grammeme.matches("Q\\d*")) { // "Q" followed by digits is used directly as a TYPEMAP key; the pattern also accepts a bare "Q".
+ TYPEMAP.put(grammeme, ignorableSet);
+ } else { // Otherwise match by enum constant name, ignoring case, across every TYPEMAP value.
+ for (Map.Entry>> entry : TYPEMAP.entrySet()) {
+ for (var grammemeEnum : entry.getValue()) {
+ String name = grammemeEnum.name();
+ if (name.equalsIgnoreCase(grammeme)) {
+ if (entry.getValue().size() == 1) { // The entry maps only to this grammeme: replace the whole value.
+ entry.setValue(ignorableSet);
+ } else { // Swap just the matching grammeme for the ignorable marker and keep the others.
+ entry.getValue().remove(grammemeEnum);
+ ArrayList> clone = new ArrayList<>(entry.getValue());
+ clone.add(ignorable);
+ entry.setValue(new HashSet<>(clone));
+ }
+ break; // Stop iterating this entry's value set right after it was modified.
+ }
+ }
+ }
+ }
+ }
+ }
+
+ TreeSet> toEnumSet(String grammemes) { // Converts a comma-separated list of grammeme names into a set of their DEFAULTMAP values.
+ TreeSet> grammemeSet = new TreeSet<>(Inflection.ENUM_COMPARATOR);
+ for (var grammeme : grammemes.split(",")) {
+ var grammemeEnum = DEFAULTMAP.get(grammeme);
+ if (grammemeEnum == null) { // Names missing from DEFAULTMAP are rejected.
+ throw new NullPointerException(grammeme + " is not a valid grammeme");
+ }
+ grammemeSet.add(grammemeEnum);
+ }
+ return grammemeSet;
+ }
+}
diff --git a/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_en.properties b/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_en.properties
new file mode 100644
index 00000000..20387004
--- /dev/null
+++ b/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_en.properties
@@ -0,0 +1,18 @@
+# Copyright 2025 Unicode Incorporated and others. All rights reserved.
+#
+# These are lexemes that should either be omitted, because they are irrelevant but can't easily be tagged as such,
+# or be marked as rare, because they are uncommon and should be sorted last in the inflection patterns.
+L15388=rare
+L299075=omit
+L342586=omit
+L468896=omit
+L469033=omit
+L469036=omit
+L469037=omit
+L469040=omit
+L469047=omit
+L684798=omit
+L685028=omit
+L685030=omit
+L984169=omit
+L1321935=omit