diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java
index 1f5f270a..ba10c2db 100644
--- a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java
+++ b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java
@@ -32,16 +32,20 @@
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
+import java.util.HashMap;
import java.util.TreeSet;
-
+import java.util.Arrays;
+import java.util.AbstractMap.SimpleEntry;
import static org.unicode.wikidata.Grammar.Gender;
import static org.unicode.wikidata.Grammar.Ignorable;
import static org.unicode.wikidata.Grammar.PartOfSpeech;
import static org.unicode.wikidata.Grammar.Sound;
/**
- * @see https://dumps.wikimedia.org/wikidatawiki/entities/
+ * @see https://dumps.wikimedia.org/wikidatawiki/entities/
*/
+
public final class ParseWikidata {
static final Set<String> PROPERTIES_WITH_PRONUNCIATION = new TreeSet<>(List.of(
"P898" // IPA transcription
@@ -72,33 +76,51 @@ void reset() {
grammemes.clear();
inflections.clear();
}
- private Lemma() {}
+
+ private Lemma() {
+ }
}
private final ParserOptions parserOptions;
private final DocumentState documentState = new DocumentState();
private final TreeSet<String> rareLemmas = new TreeSet<>();
private final TreeSet<String> omitLemmas = new TreeSet<>();
+ private final Map<String, List<String>> mergeMap = new HashMap<>();
+ private final TreeSet<String> deferredLexemes = new TreeSet<>();
+ private final Map<String, SimpleEntry<Lexeme, Integer>> lexemeMap = new HashMap<>();
- ParseWikidata(ParserOptions parserOptions)
- {
+ ParseWikidata(ParserOptions parserOptions) {
this.parserOptions = parserOptions;
for (var language : parserOptions.locales) {
Properties rareLemmasProperties = new Properties();
- String filePath = Paths.get(ParserDefaults.RESOURCES_DIR + "filter_" + language + ".properties").toAbsolutePath().toString();
+ String filePath = Paths.get(ParserDefaults.RESOURCES_DIR + "filter_" + language + ".properties")
+ .toAbsolutePath().toString();
try (var propertiesStream = new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8)) {
rareLemmasProperties.load(propertiesStream);
for (var entry : rareLemmasProperties.entrySet()) {
String key = entry.getKey().toString();
String value = entry.getValue().toString();
- switch (value) {
- case "rare" : rareLemmas.add(key); break;
- case "omit" : omitLemmas.add(key); break;
- default: throw new IllegalArgumentException(key + ": Unknown key value " + value);
+ if (value.matches("L[0-9]+")) {
+ var values = Arrays.asList(value.split(","));
+ mergeMap.computeIfAbsent(key, v -> new ArrayList<>()).addAll(values);
+ deferredLexemes.add(key);
+ deferredLexemes.addAll(values);
+ } else {
+ switch (value) {
+ case "rare": {
+ rareLemmas.add(key);
+ break;
+ }
+ case "omit": {
+ omitLemmas.add(key); break;
+ }
+ default: {
+ throw new IllegalArgumentException(key + ": Unknown key value " + value);
+ }
+ }
}
}
- }
- catch (IOException e) {
+ } catch (IOException e) {
// else oh well. It doesn't matter.
}
}
@@ -111,6 +133,11 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) {
// We really don't want this junk.
return;
}
+ if (deferredLexemes.contains(lexeme.id)) {
+ deferredLexemes.remove(lexeme.id);
+ lexemeMap.put(lexeme.id, new SimpleEntry<>(lexeme, lineNumber));
+ return;
+ }
Lemma lemma = new Lemma();
Set<? extends Enum<?>> partOfSpeechSet = null;
for (var lemmaEntry : lexeme.lemmas.entrySet()) {
@@ -122,7 +149,8 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) {
if (partOfSpeechSet == null) {
partOfSpeechSet = Grammar.getMappedGrammemes(lexeme.lexicalCategory);
if (partOfSpeechSet == null) {
- throw new IllegalArgumentException(lexeme.lexicalCategory + " is not a known part of speech grammeme for " + lexeme.id + "(" + lemma.value + ")");
+ throw new IllegalArgumentException(lexeme.lexicalCategory
+ + " is not a known part of speech grammeme for " + lexeme.id + "(" + lemma.value + ")");
}
}
lemma.grammemes.addAll(partOfSpeechSet);
@@ -134,7 +162,9 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) {
var variant = Grammar.getMappedGrammemes(additionalCategory);
if (variant == null) {
if (parserOptions.debug) {
- System.err.println("Line " + lineNumber + ": " + additionalCategory + " is not a known grammeme for the language variant " + lexeme.id + "(" + lemma.value + ")");
+ System.err.println("Line " + lineNumber + ": " + additionalCategory
+ + " is not a known grammeme for the language variant " + lexeme.id + "(" + lemma.value
+ + ")");
}
continue;
}
@@ -144,7 +174,8 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) {
lemma.grammemes.add(Grammar.Usage.RARE);
}
extractImportantProperties(lexeme.claims, lemma.grammemes, lexeme.id, lemma.value);
- if (lemma.grammemes.contains(Ignorable.IGNORABLE_LEMMA) || lemma.grammemes.contains(Ignorable.IGNORABLE_INFLECTION)) {
+ if (lemma.grammemes.contains(Ignorable.IGNORABLE_LEMMA)
+ || lemma.grammemes.contains(Ignorable.IGNORABLE_INFLECTION)) {
documentState.unusableLemmaCount++;
continue;
}
@@ -155,8 +186,7 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) {
var representation = form.representations.get(currentLemmaLanguage);
if (representation != null) {
currentInflection = new Inflection(representation.value);
- }
- else {
+ } else {
// Couldn't find an exact match. Go to a generic match.
for (var rep : form.representations.entrySet()) {
if (isContained(currentLemmaLanguage, lemmaEntry.getKey())) {
@@ -184,17 +214,21 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) {
currentInflection.grammemeSet.remove(Grammar.Usage.RARE);
}
currentInflection.grammemeSet.remove(Ignorable.IGNORABLE_PROPERTY);
- var grammemeExpansion = parserOptions.expandGramemes != null ? parserOptions.expandGramemes.get(currentInflection.grammemeSet) : null;
- if (parserOptions.addSound && form.claims != null && !form.claims.isEmpty() && currentInflection.inflection.charAt(0) == lemma.value.charAt(0)) {
- // We have potential data, and the words aren't mixed together. So this is probably accurate.
+ var grammemeExpansion = parserOptions.expandGramemes != null
+ ? parserOptions.expandGramemes.get(currentInflection.grammemeSet)
+ : null;
+ if (parserOptions.addSound && form.claims != null && !form.claims.isEmpty()
+ && currentInflection.inflection.charAt(0) == lemma.value.charAt(0)) {
+ // We have potential data, and the words aren't mixed together. So this is
+ // probably accurate.
addSound(form.claims, currentInflection.grammemeSet, lexeme.id, lemma.value);
}
if (grammemeExpansion == null) {
lemma.inflections.add(currentInflection);
- }
- else {
+ } else {
for (var grammemeSet : grammemeExpansion) {
- var expandedInflection = new Inflection(currentInflection.inflection, currentInflection.rareUsage);
+ var expandedInflection = new Inflection(currentInflection.inflection,
+ currentInflection.rareUsage);
expandedInflection.grammemeSet.addAll(currentInflection.grammemeSet);
expandedInflection.grammemeSet.addAll(grammemeSet);
lemma.inflections.add(expandedInflection);
@@ -213,11 +247,46 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) {
analyzeLemma(lemma);
}
}
+ private void moveLexemeClaimsToForms(Lexeme lexeme) {
+ for (LexemeForm form : lexeme.forms) {
+ for (var entry : lexeme.claims.entrySet()) {
+ String key = entry.getKey();
+ if (form.claims == null) {
+ form.claims = new HashMap<>();
+ }
+ form.claims.computeIfAbsent(key, k -> new ArrayList<>()).addAll(entry.getValue());
+ }
+ }
+ lexeme.claims.clear();
+ }
+ private Lexeme mergeLexemes(Lexeme lexeme1, Lexeme lexeme2) {
+ moveLexemeClaimsToForms(lexeme2);
+ lexeme1.forms.addAll(lexeme2.forms); // Combine forms
+ return lexeme1;
+ }
+ // Merge each deferred secondary lexeme into its primary lexeme, then analyze the merged result.
+ private void processAndMergeLexemes() {
+ for (Map.Entry<String, List<String>> entry : mergeMap.entrySet()) {
+ SimpleEntry<Lexeme, Integer> pair = lexemeMap.computeIfAbsent(entry.getKey(), key -> {
+ throw new IllegalArgumentException(key + ": id not found");
+ });
+ Lexeme mergedLexeme = pair.getKey();
+ int lineNumber = pair.getValue();
+ for (var value : entry.getValue()) {
+ mergeLexemes(mergedLexeme, lexemeMap.computeIfAbsent(value, key -> {
+ throw new IllegalArgumentException(key + ": id not found");
+ }).getKey());
+ }
+ analyzeLexeme(lineNumber, mergedLexeme);
+ }
+ }
/**
- * When there are multiple genders at the lemma level, it's a ranking system instead of applying to all forms.
+ * When there are multiple genders at the lemma level, it's a ranking system
+ * instead of applying to all forms.
* Such data is useless. So we should ignore it.
- * When there are multiple genders at the form level, the same form is valid for all specified genders.
+ * When there are multiple genders at the form level, the same form is valid for
+ * all specified genders.
*/
private void removeConflicts(TreeSet<Enum<?>> grammemes, Class<?> grammemeType) {
if (grammemes.size() > 1) {
@@ -252,7 +321,8 @@ private void convertGrammemes(LexemeForm form, Inflection currentInflection, Str
extractImportantProperties(form.claims, currentInflection.grammemeSet, id, lemma);
}
- private void extractImportantProperties(Map> claims, TreeSet> grammemes, String id, String lemma) {
+ private void extractImportantProperties(Map> claims, TreeSet> grammemes, String id,
+ String lemma) {
if (claims == null || claims.isEmpty()) {
return;
}
@@ -263,10 +333,11 @@ private void extractImportantProperties(Map> claims, TreeSe
var grammemeEnum = Grammar.getMappedGrammemes(grammemeStr);
if (grammemeEnum != null) {
grammemes.addAll(grammemeEnum);
- }
- else if (parserOptions.debug) {
- // Most of this is irrelevant non-grammatical information, like that it's a trademark, or a study of something,
- // but sometimes it contains grammemes that apply to all words, like grammatical gender.
+ } else if (parserOptions.debug) {
+ // Most of this is irrelevant non-grammatical information, like that it's a
+ // trademark, or a study of something,
+ // but sometimes it contains grammemes that apply to all words, like grammatical
+ // gender.
System.err.println(grammemeStr + " is not a known grammeme for " + id + "(" + lemma + ")");
}
}
@@ -300,7 +371,8 @@ private static boolean validateStemLength(@Nonnull List<Inflection> inflections,
boolean invalid = false;
for (var inflection_inner : inflections) {
var inflectionInnerStr = inflection_inner.getInflection();
- if (inflectionInnerStr.endsWith(suffix) && ((inflectionInnerStr.length() - suffix.length()) < stemLength)) {
+ if (inflectionInnerStr.endsWith(suffix)
+ && ((inflectionInnerStr.length() - suffix.length()) < stemLength)) {
invalid = true;
break;
}
@@ -312,8 +384,9 @@ private static boolean validateStemLength(@Nonnull List<Inflection> inflections,
return true;
}
- //Provided lemma and all it's surface forms, return the length of the longest common prefix among them
- private static int getStemLength(String lemma, @Nonnull List<Inflection> inflections){
+ // Provided the lemma and all its surface forms, return the length of the longest
+ // common prefix among them
+ private static int getStemLength(String lemma, @Nonnull List<Inflection> inflections) {
String[] stringList = new String[inflections.size() + 1];
for (int i = 0; i < inflections.size(); i++) {
stringList[i] = inflections.get(i).getInflection();
@@ -342,11 +415,13 @@ private static List generateSuffixes(int stemLength, @Nonnull List<Inflection> inflections) {
- //Check whether the surface forms to be inflected or not
- private static boolean containsImportant(@Nonnull List<Inflection> inflections, EnumSet<PartOfSpeech> posToBeInflected) {
+ // Check whether the surface forms are to be inflected or not
+ private static boolean containsImportant(@Nonnull List<Inflection> inflections,
+ EnumSet<PartOfSpeech> posToBeInflected) {
for (Inflection inflection : inflections) {
if (!Collections.disjoint(posToBeInflected, inflection.getGrammemeSet())) {
return true;
@@ -367,17 +443,18 @@ private static boolean containsImportant(@Nonnull List<Inflection> inflections,
return false;
}
- //Given lemma suffix and surface form suffixes, return either an existing inflection pattern or return a new one while adding to the existing inflection patterns
+ // Given lemma suffix and surface form suffixes, return either an existing
+ // inflection pattern or return a new one while adding to the existing
+ // inflection patterns
private InflectionPattern getInflectionPattern(Lemma lemma, String lemmaSuffix,
- List suffixes) {
+ List suffixes) {
TreeSet<Enum<?>> newGrammemeList = new TreeSet<>(lemma.grammemes);
InflectionPattern inflectionPattern = new InflectionPattern(
documentState.inflectionPatterns.size() + 1,
lemmaSuffix,
newGrammemeList,
- suffixes
- );
+ suffixes);
int idx = documentState.inflectionPatterns.indexOf(inflectionPattern);
@@ -403,7 +480,7 @@ private void analyzeInflections(Lemma lemma, List<Inflection> inputInflections)
if (!inflections.isEmpty()) {
ArrayList<Inflection> nonEmptyInflections = new ArrayList<>();
// Adding lemma grammemes to all inflections
- for (int i = 0; i < inflections.size() ; i++) {
+ for (int i = 0; i < inflections.size(); i++) {
var inflection = inflections.get(i);
var inflectionGrammemes = inflection.getGrammemeSet();
if (!inflectionGrammemes.isEmpty() && !InflectionPattern.isIgnorableGrammemeSet(inflectionGrammemes)) {
@@ -412,9 +489,10 @@ private void analyzeInflections(Lemma lemma, List<Inflection> inputInflections)
}
inflectionGrammemes.addAll(lemma.grammemes);
}
- // If all inflections are empty then add all significant inflections to the pattern
+ // If all inflections are empty then add all significant inflections to the
+ // pattern
if (nonEmptyInflectionIndices.isEmpty()) {
- for (int i = 0; i < inflections.size() ; i++) {
+ for (int i = 0; i < inflections.size(); i++) {
var inflection = inflections.get(i);
if (!InflectionPattern.isIgnorableGrammemeSet(inflection.getGrammemeSet())) {
nonEmptyInflections.add(inflection);
@@ -428,16 +506,18 @@ private void analyzeInflections(Lemma lemma, List<Inflection> inputInflections)
List suffixes = generateSuffixes(stemLength, nonEmptyInflections);
inflectionPattern = getInflectionPattern(lemma,
lemma.value.substring(stemLength),
- suffixes
- );
+ suffixes);
}
- // else ignore this unimportant inflection pattern. This is usually trimmed for size.
+ // else ignore this unimportant inflection pattern. This is usually trimmed for
+ // size.
}
- for (int i = 0; i < inflections.size() ; i++) {
+ for (int i = 0; i < inflections.size(); i++) {
var inflection = inflections.get(i);
String phrase = inflection.getInflection();
- InflectionPattern inflectionPatternForDict = nonEmptyInflectionIndices.contains(i) ? inflectionPattern : null;
- documentState.addDictionaryEntry(new DictionaryEntry(phrase, phrase, lemma.isRare, inflection.getGrammemeSet(), inflectionPatternForDict));
+ InflectionPattern inflectionPatternForDict = nonEmptyInflectionIndices.contains(i) ? inflectionPattern
+ : null;
+ documentState.addDictionaryEntry(new DictionaryEntry(phrase, phrase, lemma.isRare,
+ inflection.getGrammemeSet(), inflectionPatternForDict));
}
}
@@ -457,7 +537,7 @@ private List<Inflection> enumerateInflectionsForGrammemeCombinations(Inflection
for (List<Enum<?>> list : results) {
list.add(grammeme);
}
- }else {
+ } else {
newResults.clear();
for (List<Enum<?>> list : results) {
ArrayList<Enum<?>> newList = new ArrayList<>(grammemeSet.size());
@@ -482,15 +562,15 @@ private void addGrammeme(TreeSet<Enum<?>> grammemes, @Nullable String grammeme)
Enum<?> value = Grammar.DEFAULTMAP.get(grammeme);
if (value == null) {
throw new NullPointerException(grammeme + " is not a known grammeme");
- }
- else if (!value.equals(Ignorable.IGNORABLE_PROPERTY)) {
+ } else if (!value.equals(Ignorable.IGNORABLE_PROPERTY)) {
grammemes.add(value);
}
}
}
private void mergeAdditionalGrammemes() {
- // Add any entries that are missing. The actual properties will be added elsewhere.
+ // Add any entries that are missing. The actual properties will be added
+ // elsewhere.
TreeSet<Enum<?>> grammemes = new TreeSet<>(EnumComparator.ENUM_COMPARATOR);
for (var entry : parserOptions.additionalGrammemesDict.entrySet()) {
grammemes.clear();
@@ -546,8 +626,10 @@ public static void main(String[] args) throws Exception {
var lexParser = new ParseWikidata(parserOptions);
LexemesJsonDeserializer.setLanguage(parserOptions.locales);
- // We create InputSource directly due to an occasional bugs with UTF-8 files being interpreted as malformed UTF-8.
- // We use a large buffer because we're reading a large file, and we're frequently reading file data.
+ // We create InputSource directly due to an occasional bug with UTF-8 files
+ // being interpreted as malformed UTF-8.
+ // We use a large buffer because we're reading a large file, and we're
+ // frequently reading file data.
for (String sourceFilename : parserOptions.sourceFilenames) {
try (InputStream fileInputStream = new FileInputStream(sourceFilename)) {
InputStream inputStream = fileInputStream;
@@ -567,10 +649,11 @@ public static void main(String[] args) throws Exception {
lexParser.analyzeLexeme(parser.currentLocation().getLineNr(), lexeme);
} catch (IllegalArgumentException e) {
lexParser.documentState.unusableLemmaCount++;
- System.err.println("Line " + parser.currentLocation().getLineNr() + ": " + e.getMessage());
+ System.err.println(
+ "Line " + parser.currentLocation().getLineNr() + ": " + e.getMessage());
}
- }
- while (parser.nextToken() != JsonToken.END_ARRAY);
+ } while (parser.nextToken() != JsonToken.END_ARRAY);
+ lexParser.processAndMergeLexemes();
}
}
}
@@ -586,4 +669,3 @@ public static void main(String[] args) throws Exception {
}
}
}
-
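Note on the merge flow introduced above: lexemes named in a filter file are parked in deferredLexemes while the Wikidata dump is streamed, and processAndMergeLexemes() later folds each secondary lexeme's forms into its primary lexeme (via moveLexemeClaimsToForms and mergeLexemes) before handing the combined lexeme to analyzeLexeme with its original line number. Below is a minimal, self-contained sketch of that two-pass idea; MiniLexeme, DeferredMergeSketch, and the sample forms are hypothetical stand-ins for illustration only, not the classes in this patch.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public final class DeferredMergeSketch {
    // Simplified stand-in for the real Lexeme class (hypothetical).
    static final class MiniLexeme {
        final String id;
        final List<String> forms = new ArrayList<>();
        MiniLexeme(String id) { this.id = id; }
    }

    // primary lexeme id -> secondary ids whose forms get folded into it
    private final Map<String, List<String>> mergeMap = new HashMap<>();
    // lexemes parked during the streaming pass until all merge partners are seen
    private final Map<String, MiniLexeme> deferred = new HashMap<>();

    void addMerge(String primary, String secondary) {
        mergeMap.computeIfAbsent(primary, k -> new ArrayList<>()).add(secondary);
    }

    // Pass 1: while streaming the dump, park any lexeme that participates in a merge.
    boolean deferIfNeeded(MiniLexeme lexeme) {
        boolean involved = mergeMap.containsKey(lexeme.id)
                || mergeMap.values().stream().anyMatch(ids -> ids.contains(lexeme.id));
        if (involved) {
            deferred.put(lexeme.id, lexeme);
        }
        return involved;
    }

    // Pass 2: fold each secondary lexeme's forms into its primary, then analyze the result.
    void mergeAndAnalyze() {
        for (Map.Entry<String, List<String>> entry : mergeMap.entrySet()) {
            MiniLexeme primary = deferred.get(entry.getKey());
            if (primary == null) {
                throw new IllegalArgumentException(entry.getKey() + ": id not found");
            }
            for (String secondaryId : entry.getValue()) {
                MiniLexeme secondary = deferred.get(secondaryId);
                if (secondary == null) {
                    throw new IllegalArgumentException(secondaryId + ": id not found");
                }
                primary.forms.addAll(secondary.forms);
            }
            System.out.println(primary.id + " now carries " + primary.forms.size() + " forms");
        }
    }

    public static void main(String[] args) {
        DeferredMergeSketch sketch = new DeferredMergeSketch();
        sketch.addMerge("L313979", "L481883"); // one of the id pairs from filter_de.properties

        MiniLexeme masculine = new MiniLexeme("L313979");
        masculine.forms.add("hypothetical masculine form");
        MiniLexeme feminine = new MiniLexeme("L481883");
        feminine.forms.add("hypothetical feminine form");

        sketch.deferIfNeeded(masculine);
        sketch.deferIfNeeded(feminine);
        sketch.mergeAndAnalyze(); // prints: L313979 now carries 2 forms
    }
}

The real parser additionally copies lexeme-level claims down onto each secondary form before combining, so grammemes attached at the lexeme level of the merged-in entry are not lost, and it replays the stored line number so diagnostics still point at the original dump location.
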
diff --git a/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_de.properties b/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_de.properties
new file mode 100644
index 00000000..77a496e4
--- /dev/null
+++ b/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_de.properties
@@ -0,0 +1,20 @@
+# Copyright 2025 Unicode Incorporated and others. All rights reserved.
+
+#organisator = organisatorin
+#Eigentümer = Eigentümerin
+# Autor = Autorin
+#Teilnehmer = Teilnehmerin
+#Freund = Freundin
+#Ehemann = Ehefrau
+#Benutzer = Benutzerin
+#Organspender = Organspenderin
+#Besucher = Besucherin
+L313979=L481883
+L296285=L833806
+L34181=L34182
+L447531=L481654
+L58087=L58088
+L484250=L252570
+L44834=L494386
+L860063=L931664
+L2272=L295129
diff --git a/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_it.properties b/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_it.properties
new file mode 100644
index 00000000..7dcc0c06
--- /dev/null
+++ b/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_it.properties
@@ -0,0 +1,4 @@
+# Copyright 2025 Unicode Incorporated and others. All rights reserved.
+
+#attore(masculine) = attrice(feminine)
+L1101749=L202915
\ No newline at end of file
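
Each filter_<lang>.properties entry maps a key to either the literal value rare or omit, or, as in the two new files above, to another lexeme ID that is merged into the key's lexeme (for example, folding a feminine noun's forms into the corresponding masculine lexeme). The sketch below shows one way to read such a file with java.util.Properties; the file name is hypothetical, and the value pattern is widened to L[0-9]+(,L[0-9]+)* so it also accepts the comma-separated lists that the parser's split(",") call anticipates.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;

public final class FilterFileSketch {
    public static void main(String[] args) throws IOException {
        Properties props = new Properties();
        // Hypothetical path; ParseWikidata builds it from ParserDefaults.RESOURCES_DIR and the locale.
        try (var reader = new InputStreamReader(new FileInputStream("filter_de.properties"), StandardCharsets.UTF_8)) {
            props.load(reader);
        }

        Set<String> rareLemmas = new TreeSet<>();
        Set<String> omitLemmas = new TreeSet<>();
        Map<String, List<String>> mergeMap = new HashMap<>();
        for (var entry : props.entrySet()) {
            String key = entry.getKey().toString();
            String value = entry.getValue().toString();
            if (value.matches("L[0-9]+(,L[0-9]+)*")) {
                // Merge entry: the listed lexeme ids get folded into the key's lexeme.
                mergeMap.computeIfAbsent(key, k -> new ArrayList<>()).addAll(Arrays.asList(value.split(",")));
            } else if ("rare".equals(value)) {
                rareLemmas.add(key);
            } else if ("omit".equals(value)) {
                omitLemmas.add(key);
            } else {
                throw new IllegalArgumentException(key + ": unknown value " + value);
            }
        }
        System.out.println("merge entries: " + mergeMap);
        System.out.println("rare: " + rareLemmas + ", omit: " + omitLemmas);
    }
}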