diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java index 1f5f270a..ba10c2db 100644 --- a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java +++ b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java @@ -32,16 +32,20 @@ import java.util.Map; import java.util.Set; import java.util.TreeMap; +import java.util.HashMap; import java.util.TreeSet; - +import java.util.Arrays; +import java.util.AbstractMap.SimpleEntry; import static org.unicode.wikidata.Grammar.Gender; import static org.unicode.wikidata.Grammar.Ignorable; import static org.unicode.wikidata.Grammar.PartOfSpeech; import static org.unicode.wikidata.Grammar.Sound; /** - * @see https://dumps.wikimedia.org/wikidatawiki/entities/ + * @see https://dumps.wikimedia.org/wikidatawiki/entities/ */ + public final class ParseWikidata { static final Set PROPERTIES_WITH_PRONUNCIATION = new TreeSet<>(List.of( "P898" // IPA transcription @@ -72,33 +76,51 @@ void reset() { grammemes.clear(); inflections.clear(); } - private Lemma() {} + + private Lemma() { + } } private final ParserOptions parserOptions; private final DocumentState documentState = new DocumentState(); private final TreeSet rareLemmas = new TreeSet<>(); private final TreeSet omitLemmas = new TreeSet<>(); + private final Map> mergeMap = new HashMap<>(); + private final TreeSet deferredLexemes = new TreeSet<>(); + private final Map> lexemeMap = new HashMap<>(); - ParseWikidata(ParserOptions parserOptions) - { + ParseWikidata(ParserOptions parserOptions) { this.parserOptions = parserOptions; for (var language : parserOptions.locales) { Properties rareLemmasProperties = new Properties(); - String filePath = Paths.get(ParserDefaults.RESOURCES_DIR + "filter_" + language + ".properties").toAbsolutePath().toString(); + String filePath = Paths.get(ParserDefaults.RESOURCES_DIR + "filter_" + language + ".properties") + .toAbsolutePath().toString(); try (var propertiesStream = new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8)) { rareLemmasProperties.load(propertiesStream); for (var entry : rareLemmasProperties.entrySet()) { String key = entry.getKey().toString(); String value = entry.getValue().toString(); - switch (value) { - case "rare" : rareLemmas.add(key); break; - case "omit" : omitLemmas.add(key); break; - default: throw new IllegalArgumentException(key + ": Unknown key value " + value); + if (value.matches("L[0-9]+")) { + var values = Arrays.asList(value.split(",")); + mergeMap.computeIfAbsent(key, v -> new ArrayList<>()).addAll(values); + deferredLexemes.add(key); + deferredLexemes.addAll(values); + } else { + switch (value) { + case "rare": { + rareLemmas.add(key); + break; + } + case "omit": { + omitLemmas.add(key); break; + } + default: { + throw new IllegalArgumentException(key + ": Unknown key value " + value); + } + } } } - } - catch (IOException e) { + } catch (IOException e) { // else oh well. It doesn't matter. } } @@ -111,6 +133,11 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) { // We really don't want this junk. return; } + if (deferredLexemes.contains(lexeme.id)) { + deferredLexemes.remove(lexeme.id); + lexemeMap.put(lexeme.id, new SimpleEntry<>(lexeme, lineNumber)); + return; + } Lemma lemma = new Lemma(); Set> partOfSpeechSet = null; for (var lemmaEntry : lexeme.lemmas.entrySet()) { @@ -122,7 +149,8 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) { if (partOfSpeechSet == null) { partOfSpeechSet = Grammar.getMappedGrammemes(lexeme.lexicalCategory); if (partOfSpeechSet == null) { - throw new IllegalArgumentException(lexeme.lexicalCategory + " is not a known part of speech grammeme for " + lexeme.id + "(" + lemma.value + ")"); + throw new IllegalArgumentException(lexeme.lexicalCategory + + " is not a known part of speech grammeme for " + lexeme.id + "(" + lemma.value + ")"); } } lemma.grammemes.addAll(partOfSpeechSet); @@ -134,7 +162,9 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) { var variant = Grammar.getMappedGrammemes(additionalCategory); if (variant == null) { if (parserOptions.debug) { - System.err.println("Line " + lineNumber + ": " + additionalCategory + " is not a known grammeme for the language variant " + lexeme.id + "(" + lemma.value + ")"); + System.err.println("Line " + lineNumber + ": " + additionalCategory + + " is not a known grammeme for the language variant " + lexeme.id + "(" + lemma.value + + ")"); } continue; } @@ -144,7 +174,8 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) { lemma.grammemes.add(Grammar.Usage.RARE); } extractImportantProperties(lexeme.claims, lemma.grammemes, lexeme.id, lemma.value); - if (lemma.grammemes.contains(Ignorable.IGNORABLE_LEMMA) || lemma.grammemes.contains(Ignorable.IGNORABLE_INFLECTION)) { + if (lemma.grammemes.contains(Ignorable.IGNORABLE_LEMMA) + || lemma.grammemes.contains(Ignorable.IGNORABLE_INFLECTION)) { documentState.unusableLemmaCount++; continue; } @@ -155,8 +186,7 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) { var representation = form.representations.get(currentLemmaLanguage); if (representation != null) { currentInflection = new Inflection(representation.value); - } - else { + } else { // Couldn't find an exact match. Go to a generic match. for (var rep : form.representations.entrySet()) { if (isContained(currentLemmaLanguage, lemmaEntry.getKey())) { @@ -184,17 +214,21 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) { currentInflection.grammemeSet.remove(Grammar.Usage.RARE); } currentInflection.grammemeSet.remove(Ignorable.IGNORABLE_PROPERTY); - var grammemeExpansion = parserOptions.expandGramemes != null ? parserOptions.expandGramemes.get(currentInflection.grammemeSet) : null; - if (parserOptions.addSound && form.claims != null && !form.claims.isEmpty() && currentInflection.inflection.charAt(0) == lemma.value.charAt(0)) { - // We have potential data, and the words aren't mixed together. So this is probably accurate. + var grammemeExpansion = parserOptions.expandGramemes != null + ? parserOptions.expandGramemes.get(currentInflection.grammemeSet) + : null; + if (parserOptions.addSound && form.claims != null && !form.claims.isEmpty() + && currentInflection.inflection.charAt(0) == lemma.value.charAt(0)) { + // We have potential data, and the words aren't mixed together. So this is + // probably accurate. addSound(form.claims, currentInflection.grammemeSet, lexeme.id, lemma.value); } if (grammemeExpansion == null) { lemma.inflections.add(currentInflection); - } - else { + } else { for (var grammemeSet : grammemeExpansion) { - var expandedInflection = new Inflection(currentInflection.inflection, currentInflection.rareUsage); + var expandedInflection = new Inflection(currentInflection.inflection, + currentInflection.rareUsage); expandedInflection.grammemeSet.addAll(currentInflection.grammemeSet); expandedInflection.grammemeSet.addAll(grammemeSet); lemma.inflections.add(expandedInflection); @@ -213,11 +247,46 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) { analyzeLemma(lemma); } } + private void moveLexemeClaimsToForms(Lexeme lexeme) { + for (LexemeForm form : lexeme.forms) { + for (Map.Entry> entry : lexeme.claims.entrySet()) { + String key = entry.getKey(); + if (form.claims == null) { + form.claims = new HashMap<>(); + } + form.claims.computeIfAbsent(key, k -> new ArrayList<>()).addAll(entry.getValue()); + } + } + lexeme.claims.clear(); + } + private Lexeme mergeLexemes(Lexeme lexeme1, Lexeme lexeme2) { + moveLexemeClaimsToForms(lexeme2); + lexeme1.forms.addAll(lexeme2.forms); // Combine forms + return lexeme1; + } + // Method to process and merge lexemes + private void processAndMergeLexemes() { + for (Map.Entry> entry : mergeMap.entrySet()) { + SimpleEntry pair = lexemeMap.computeIfAbsent(entry.getKey(), key -> { + throw new IllegalArgumentException(key + ": id not found"); + }); + Lexeme mergedLexeme = pair.getKey(); + int lineNumber = pair.getValue(); + for (var value : entry.getValue()) { + mergeLexemes(mergedLexeme, lexemeMap.computeIfAbsent(value, key -> { + throw new IllegalArgumentException(key + ": id not found"); + }).getKey()); + } + analyzeLexeme(lineNumber, mergedLexeme); + } + } /** - * When there are multiple genders at the lemma level, it's a ranking system instead of applying to all forms. + * When there are multiple genders at the lemma level, it's a ranking system + * instead of applying to all forms. * Such data is useless. So we should ignore it. - * When there are multiple genders at the form level, the same form is valid for all specified genders. + * When there are multiple genders at the form level, the same form is valid for + * all specified genders. */ private void removeConflicts(TreeSet> grammemes, Class grammemeType) { if (grammemes.size() > 1) { @@ -252,7 +321,8 @@ private void convertGrammemes(LexemeForm form, Inflection currentInflection, Str extractImportantProperties(form.claims, currentInflection.grammemeSet, id, lemma); } - private void extractImportantProperties(Map> claims, TreeSet> grammemes, String id, String lemma) { + private void extractImportantProperties(Map> claims, TreeSet> grammemes, String id, + String lemma) { if (claims == null || claims.isEmpty()) { return; } @@ -263,10 +333,11 @@ private void extractImportantProperties(Map> claims, TreeSe var grammemeEnum = Grammar.getMappedGrammemes(grammemeStr); if (grammemeEnum != null) { grammemes.addAll(grammemeEnum); - } - else if (parserOptions.debug) { - // Most of this is irrelevant non-grammatical information, like that it's a trademark, or a study of something, - // but sometimes it contains grammemes that apply to all words, like grammatical gender. + } else if (parserOptions.debug) { + // Most of this is irrelevant non-grammatical information, like that it's a + // trademark, or a study of something, + // but sometimes it contains grammemes that apply to all words, like grammatical + // gender. System.err.println(grammemeStr + " is not a known grammeme for " + id + "(" + lemma + ")"); } } @@ -300,7 +371,8 @@ private static boolean validateStemLength(@Nonnull List inflections, boolean invalid = false; for (var inflection_inner : inflections) { var inflectionInnerStr = inflection_inner.getInflection(); - if (inflectionInnerStr.endsWith(suffix) && ((inflectionInnerStr.length() - suffix.length()) < stemLength)) { + if (inflectionInnerStr.endsWith(suffix) + && ((inflectionInnerStr.length() - suffix.length()) < stemLength)) { invalid = true; break; } @@ -312,8 +384,9 @@ private static boolean validateStemLength(@Nonnull List inflections, return true; } - //Provided lemma and all it's surface forms, return the length of the longest common prefix among them - private static int getStemLength(String lemma, @Nonnull List inflections){ + // Provided lemma and all it's surface forms, return the length of the longest + // common prefix among them + private static int getStemLength(String lemma, @Nonnull List inflections) { String[] stringList = new String[inflections.size() + 1]; for (int i = 0; i < inflections.size(); i++) { stringList[i] = inflections.get(i).getInflection(); @@ -342,11 +415,13 @@ private static List generateSuffixes(int stemLength, @Nonnull List generateSuffixes(int stemLength, @Nonnull List inflections, EnumSet posToBeInflected) { + // Check whether the surface forms to be inflected or not + private static boolean containsImportant(@Nonnull List inflections, + EnumSet posToBeInflected) { for (Inflection inflection : inflections) { if (!Collections.disjoint(posToBeInflected, inflection.getGrammemeSet())) { return true; @@ -367,17 +443,18 @@ private static boolean containsImportant(@Nonnull List inflections, return false; } - //Given lemma suffix and surface form suffixes, return either an existing inflection pattern or return a new one while adding to the existing inflection patterns + // Given lemma suffix and surface form suffixes, return either an existing + // inflection pattern or return a new one while adding to the existing + // inflection patterns private InflectionPattern getInflectionPattern(Lemma lemma, String lemmaSuffix, - List suffixes) { + List suffixes) { TreeSet> newGrammemeList = new TreeSet<>(lemma.grammemes); InflectionPattern inflectionPattern = new InflectionPattern( documentState.inflectionPatterns.size() + 1, lemmaSuffix, newGrammemeList, - suffixes - ); + suffixes); int idx = documentState.inflectionPatterns.indexOf(inflectionPattern); @@ -403,7 +480,7 @@ private void analyzeInflections(Lemma lemma, List inputInflections) if (!inflections.isEmpty()) { ArrayList nonEmptyInflections = new ArrayList<>(); // Adding lemma grammemes to all inflections - for (int i = 0; i < inflections.size() ; i++) { + for (int i = 0; i < inflections.size(); i++) { var inflection = inflections.get(i); var inflectionGrammemes = inflection.getGrammemeSet(); if (!inflectionGrammemes.isEmpty() && !InflectionPattern.isIgnorableGrammemeSet(inflectionGrammemes)) { @@ -412,9 +489,10 @@ private void analyzeInflections(Lemma lemma, List inputInflections) } inflectionGrammemes.addAll(lemma.grammemes); } - // If all inflections are empty then add all significant inflections to the pattern + // If all inflections are empty then add all significant inflections to the + // pattern if (nonEmptyInflectionIndices.isEmpty()) { - for (int i = 0; i < inflections.size() ; i++) { + for (int i = 0; i < inflections.size(); i++) { var inflection = inflections.get(i); if (!InflectionPattern.isIgnorableGrammemeSet(inflection.getGrammemeSet())) { nonEmptyInflections.add(inflection); @@ -428,16 +506,18 @@ private void analyzeInflections(Lemma lemma, List inputInflections) List suffixes = generateSuffixes(stemLength, nonEmptyInflections); inflectionPattern = getInflectionPattern(lemma, lemma.value.substring(stemLength), - suffixes - ); + suffixes); } - // else ignore this unimportant inflection pattern. This is usually trimmed for size. + // else ignore this unimportant inflection pattern. This is usually trimmed for + // size. } - for (int i = 0; i < inflections.size() ; i++) { + for (int i = 0; i < inflections.size(); i++) { var inflection = inflections.get(i); String phrase = inflection.getInflection(); - InflectionPattern inflectionPatternForDict = nonEmptyInflectionIndices.contains(i) ? inflectionPattern : null; - documentState.addDictionaryEntry(new DictionaryEntry(phrase, phrase, lemma.isRare, inflection.getGrammemeSet(), inflectionPatternForDict)); + InflectionPattern inflectionPatternForDict = nonEmptyInflectionIndices.contains(i) ? inflectionPattern + : null; + documentState.addDictionaryEntry(new DictionaryEntry(phrase, phrase, lemma.isRare, + inflection.getGrammemeSet(), inflectionPatternForDict)); } } @@ -457,7 +537,7 @@ private List enumerateInflectionsForGrammemeCombinations(Inflection for (List> list : results) { list.add(grammeme); } - }else { + } else { newResults.clear(); for (List> list : results) { ArrayList> newList = new ArrayList<>(grammemeSet.size()); @@ -482,15 +562,15 @@ private void addGrammeme(TreeSet> grammemes, @Nullable String grammeme) Enum value = Grammar.DEFAULTMAP.get(grammeme); if (value == null) { throw new NullPointerException(grammeme + " is not a known grammeme"); - } - else if (!value.equals(Ignorable.IGNORABLE_PROPERTY)) { + } else if (!value.equals(Ignorable.IGNORABLE_PROPERTY)) { grammemes.add(value); } } } private void mergeAdditionalGrammemes() { - // Add any entries that are missing. The actual properties will be added elsewhere. + // Add any entries that are missing. The actual properties will be added + // elsewhere. TreeSet> grammemes = new TreeSet<>(EnumComparator.ENUM_COMPARATOR); for (var entry : parserOptions.additionalGrammemesDict.entrySet()) { grammemes.clear(); @@ -546,8 +626,10 @@ public static void main(String[] args) throws Exception { var lexParser = new ParseWikidata(parserOptions); LexemesJsonDeserializer.setLanguage(parserOptions.locales); - // We create InputSource directly due to an occasional bugs with UTF-8 files being interpreted as malformed UTF-8. - // We use a large buffer because we're reading a large file, and we're frequently reading file data. + // We create InputSource directly due to an occasional bugs with UTF-8 files + // being interpreted as malformed UTF-8. + // We use a large buffer because we're reading a large file, and we're + // frequently reading file data. for (String sourceFilename : parserOptions.sourceFilenames) { try (InputStream fileInputStream = new FileInputStream(sourceFilename)) { InputStream inputStream = fileInputStream; @@ -567,10 +649,11 @@ public static void main(String[] args) throws Exception { lexParser.analyzeLexeme(parser.currentLocation().getLineNr(), lexeme); } catch (IllegalArgumentException e) { lexParser.documentState.unusableLemmaCount++; - System.err.println("Line " + parser.currentLocation().getLineNr() + ": " + e.getMessage()); + System.err.println( + "Line " + parser.currentLocation().getLineNr() + ": " + e.getMessage()); } - } - while (parser.nextToken() != JsonToken.END_ARRAY); + } while (parser.nextToken() != JsonToken.END_ARRAY); + lexParser.processAndMergeLexemes(); } } } @@ -586,4 +669,3 @@ public static void main(String[] args) throws Exception { } } } - diff --git a/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_de.properties b/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_de.properties new file mode 100644 index 00000000..77a496e4 --- /dev/null +++ b/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_de.properties @@ -0,0 +1,20 @@ +# Copyright 2025 Unicode Incorporated and others. All rights reserved. + +#organisator = organisatorin +#Eigentümer = Eigentümerin +# Autor = Autorin +#Teilnehmer = Teilnehmerin +#Freund = Freundin +#Ehemann = Ehefrau +#Benutzer = Benutzerin +#Organspender = Organspenderin +#Besucher = Besucherin +L313979=L481883 +L296285=L833806 +L34181=L34182 +L447531=L481654 +L58087=L58088 +L484250=L252570 +L44834=L494386 +L860063=L931664 +L2272=L295129 diff --git a/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_it.properties b/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_it.properties new file mode 100644 index 00000000..7dcc0c06 --- /dev/null +++ b/inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_it.properties @@ -0,0 +1,4 @@ +# Copyright 2025 Unicode Incorporated and others. All rights reserved. + +#attore(masculine) = attrice(feminine) +L1101749=L202915 \ No newline at end of file