Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ public String toString() {

enum Tense {
PAST,
DISTANT_PAST,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hindi concept for one word.

PRESENT,
FUTURE;

Expand Down Expand Up @@ -233,7 +234,8 @@ public String toString() {
}

enum FormType {
SHORT_FORM;
SHORT_FORM,
LONG_FORM;

private final String printableValue;
FormType() {
Expand Down Expand Up @@ -591,10 +593,12 @@ public String toString() {
TYPEMAP.put("Q576271", EnumSet.of(PartOfSpeech.DETERMINER));
TYPEMAP.put("Q5051", new HashSet<>(Arrays.asList(Case.GENITIVE, PartOfSpeech.DETERMINER))); // possessive determiner
TYPEMAP.put("Q2824480", EnumSet.of(PartOfSpeech.DETERMINER)); // demonstrative adjective, but it's really a determiner.
TYPEMAP.put("Q2112896", EnumSet.of(PartOfSpeech.DETERMINER)); // pronominal adverb
TYPEMAP.put("Q83034", EnumSet.of(PartOfSpeech.INTERJECTION));
TYPEMAP.put("Q2304610", EnumSet.of(PartOfSpeech.INTERROGATIVE));
TYPEMAP.put("Q12021746", EnumSet.of(PartOfSpeech.INTERROGATIVE));
TYPEMAP.put("Q54310231", EnumSet.of(PartOfSpeech.INTERROGATIVE, PartOfSpeech.PRONOUN));
TYPEMAP.put("Q60798917", EnumSet.of(PartOfSpeech.INTERROGATIVE)); // question tag
TYPEMAP.put("Q9788", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // letter
TYPEMAP.put("Q3241972", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // character
TYPEMAP.put("Q1084", EnumSet.of(PartOfSpeech.NOUN));
Expand All @@ -614,13 +618,19 @@ public String toString() {
TYPEMAP.put("Q10535365", EnumSet.of(PartOfSpeech.PARTICLE)); // infinitive marker, infinitive participle, infinitive particle
TYPEMAP.put("Q113198319", new HashSet<>(Arrays.asList(PartOfSpeech.ADVERB, PartOfSpeech.PARTICLE))); // adverbial particle
TYPEMAP.put("Q115762248", EnumSet.of(PartOfSpeech.PARTICLE)); // vocative particle
TYPEMAP.put("Q115475265", EnumSet.of(PartOfSpeech.PARTICLE)); // honorific particle
TYPEMAP.put("Q113076880", EnumSet.of(PartOfSpeech.ADVERB)); // postpositive adverb
TYPEMAP.put("Q65807752", EnumSet.of(PartOfSpeech.ADVERB)); // demonstrative adverb
TYPEMAP.put("Q117321826", EnumSet.of(PartOfSpeech.ADVERB)); // localiser, similar to an adverb
TYPEMAP.put("Q134316", EnumSet.of(PartOfSpeech.ADPOSITION)); // adposition
TYPEMAP.put("Q161873", EnumSet.of(PartOfSpeech.ADPOSITION)); // postposition
TYPEMAP.put("Q4833830", EnumSet.of(PartOfSpeech.ADPOSITION)); // preposition
TYPEMAP.put("Q36224", EnumSet.of(PartOfSpeech.PRONOUN));
TYPEMAP.put("Q2006180", EnumSet.of(PartOfSpeech.PRONOUN)); // pro-form, word that substitutes for another word, broader scope than pronoun
TYPEMAP.put("Q115272253", EnumSet.of(PartOfSpeech.PRONOUN)); // possessive adjective, like "your"
TYPEMAP.put("Q2824485", EnumSet.of(PartOfSpeech.PRONOUN)); // pronominal adjective
TYPEMAP.put("Q115272205", EnumSet.of(PartOfSpeech.PRONOUN)); // reflexive adjective
TYPEMAP.put("Q79377411", EnumSet.of(PartOfSpeech.PRONOUN)); // demonstrative pronoun
TYPEMAP.put("Q147276", EnumSet.of(PartOfSpeech.PROPER_NOUN)); // proper noun
TYPEMAP.put("Q7884789", EnumSet.of(PartOfSpeech.PROPER_NOUN)); // toponym
TYPEMAP.put("Q43229", EnumSet.of(PartOfSpeech.PROPER_NOUN)); // organization
Expand All @@ -634,9 +644,11 @@ public String toString() {
TYPEMAP.put("Q3254028", EnumSet.of(PartOfSpeech.VERB)); // separable verb, verb with a prefix which separates from the core verb in certain positions in a sentence

TYPEMAP.put("Q4239848", new HashSet<>(Arrays.asList(FormType.SHORT_FORM, PartOfSpeech.ADJECTIVE))); // short form of an adjective
TYPEMAP.put("Q96406487", EnumSet.of(FormType.SHORT_FORM)); // short form
TYPEMAP.put("Q112154", EnumSet.of(FormType.SHORT_FORM)); // apocope, loss of word-final sounds
TYPEMAP.put("Q650250", EnumSet.of(FormType.SHORT_FORM)); // elision, omission of one or more sounds in a word
TYPEMAP.put("Q114092330", EnumSet.of(FormType.SHORT_FORM)); // prevocalic form, linguistic feature marking a linguistic unit as appearing only before vowels
TYPEMAP.put("Q96406455", EnumSet.of(FormType.LONG_FORM)); // long form

TYPEMAP.put("Q109267112", EnumSet.of(Polarity.AFFIRMATIVE));
TYPEMAP.put("Q1478451", EnumSet.of(Polarity.NEGATIVE));
Expand Down Expand Up @@ -745,11 +757,16 @@ public String toString() {
TYPEMAP.put("Q1230649", new HashSet<>(Arrays.asList(Tense.PAST, VerbType.PARTICIPLE)));
TYPEMAP.put("Q72249355", new HashSet<>(Arrays.asList(Voice.ACTIVE, VerbType.PARTICIPLE)));
TYPEMAP.put("Q72249544", new HashSet<>(Arrays.asList(Voice.PASSIVE, VerbType.PARTICIPLE)));
TYPEMAP.put("Q430255", new HashSet<>(Arrays.asList(Tense.PRESENT, Voice.ACTIVE, VerbType.PARTICIPLE))); // present active participle
TYPEMAP.put("Q117824585", new HashSet<>(Arrays.asList(Tense.PRESENT, Voice.PASSIVE, VerbType.PARTICIPLE))); // present passive participle
TYPEMAP.put("Q16086106", new HashSet<>(Arrays.asList(Tense.PAST, Voice.PASSIVE, VerbType.PARTICIPLE))); // past active participle
TYPEMAP.put("Q113133303", EnumSet.of(VerbType.PARTICIPLE)); // conjunctive participle
TYPEMAP.put("Q192613", EnumSet.of(Tense.PRESENT)); // present tense
TYPEMAP.put("Q3910936", new HashSet<>(Arrays.asList(Aspect.SIMPLE, Tense.PRESENT))); // simple present and usually future
TYPEMAP.put("Q1994301", EnumSet.of(Tense.PAST)); // past tense
TYPEMAP.put("Q1392475", new HashSet<>(Arrays.asList(Aspect.SIMPLE, Tense.PAST))); // simple past
TYPEMAP.put("Q113326559", EnumSet.of(Tense.PAST)); // non-remote tense
TYPEMAP.put("Q113326099", EnumSet.of(Tense.DISTANT_PAST)); // remote tense
TYPEMAP.put("Q501405", EnumSet.of(Tense.FUTURE)); // future tense
TYPEMAP.put("Q344", EnumSet.of(Tense.FUTURE)); // future
TYPEMAP.put("Q1475560", new HashSet<>(Arrays.asList(Aspect.SIMPLE, Tense.FUTURE))); // simple future
Expand All @@ -767,6 +784,8 @@ public String toString() {
TYPEMAP.put("Q115223950", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // infinitive form in Norwegian Nynorsk that ends in 'a'
TYPEMAP.put("Q115223951", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // infinitive form in Norwegian Nynorsk that ends in 'e'
TYPEMAP.put("Q1923028", EnumSet.of(VerbType.GERUND));
TYPEMAP.put("Q380012", EnumSet.of(VerbType.GERUND)); // adverbial
TYPEMAP.put("Q904896", EnumSet.of(VerbType.GERUND)); // transgressive, adverbial participle
TYPEMAP.put("Q52434511", new HashSet<>(Arrays.asList(Tense.PRESENT, VerbType.GERUND)));
TYPEMAP.put("Q52434598", new HashSet<>(Arrays.asList(Tense.PAST, VerbType.GERUND)));
TYPEMAP.put("Q1050494", EnumSet.of(VerbType.NONFINITE));
Expand All @@ -791,7 +810,7 @@ public String toString() {

TYPEMAP.put("Q468801", EnumSet.of(PartOfSpeech.PRONOUN)); // personal pronoun
TYPEMAP.put("Q1502460", EnumSet.of(PartOfSpeech.PRONOUN)); // possessive pronoun
TYPEMAP.put("Q34793275", new HashSet<>(Arrays.asList(Definiteness.DEMONSTRATIVE, PartOfSpeech.PRONOUN))); // demonstrative pronoun
TYPEMAP.put("Q34793275", EnumSet.of(PartOfSpeech.PRONOUN)); // demonstrative pronoun
TYPEMAP.put("Q953129", EnumSet.of(PartOfSpeech.PRONOUN)); // reflexive pronoun
TYPEMAP.put("Q130266209", EnumSet.of(PartOfSpeech.PRONOUN)); // reflexive personal pronoun
TYPEMAP.put("Q1050744", EnumSet.of(PartOfSpeech.PRONOUN)); // relative pronoun
Expand All @@ -816,6 +835,7 @@ public String toString() {
TYPEMAP.put("Q3502544", new HashSet<>(Arrays.asList(Tense.PAST, Mood.SUBJUNCTIVE))); // past subjunctive
TYPEMAP.put("Q3502541", new HashSet<>(Arrays.asList(Aspect.IMPERFECT, Tense.PAST, Mood.SUBJUNCTIVE))); // imperfect subjunctive
TYPEMAP.put("Q113289507", EnumSet.of(Mood.EMPHATIC));
TYPEMAP.put("Q113959607", EnumSet.of(Mood.EMPHATIC)); // emphatic particle
TYPEMAP.put("Q2532941", EnumSet.of(Mood.VOLITIVE));

TYPEMAP.put("Q5636904", EnumSet.of(Aspect.HABITUAL));
Expand Down Expand Up @@ -889,6 +909,9 @@ public String toString() {
TYPEMAP.put("Q12237354", EnumSet.of(Usage.RARE)); // obsolete word
TYPEMAP.put("Q54943392", EnumSet.of(Usage.RARE)); // obsolete form
TYPEMAP.put("Q181970", EnumSet.of(Usage.RARE)); // archaism
TYPEMAP.put("Q1098772", EnumSet.of(Usage.RARE)); // broken plural
TYPEMAP.put("Q54944750", EnumSet.of(Usage.RARE)); // potential form
TYPEMAP.put("Q55074511", EnumSet.of(Usage.RARE)); // reconstructed word

// Phrases and other things that don't inflect
TYPEMAP.put("Q101352", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // family name. Lots of them conflict with common nouns, like "light"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import java.util.TreeMap;
import java.util.TreeSet;

import static org.unicode.wikidata.Grammar.Gender;
import static org.unicode.wikidata.Grammar.Ignorable;
import static org.unicode.wikidata.Grammar.PartOfSpeech;
import static org.unicode.wikidata.Grammar.Sound;
Expand All @@ -48,7 +49,8 @@ public final class ParseWikidata {
static final Set<String> PROPERTIES_WITH_GRAMMEMES = new TreeSet<>(List.of(
"P31", // instance of. Sometimes phrase information is here.
"P1552", // has characteristic for animacy
"P5185" // grammatical gender
"P5185", // grammatical gender
"P11054" // grammatical number
));
static final Set<String> IMPORTANT_PROPERTIES = new TreeSet<>(PROPERTIES_WITH_GRAMMEMES);

Expand Down Expand Up @@ -147,6 +149,7 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) {
continue;
}
lemma.grammemes.remove(Ignorable.IGNORABLE_PROPERTY);
removeConflicts(lemma.grammemes, Gender.class);
for (var form : lexeme.forms) {
Inflection currentInflection = null;
var representation = form.representations.get(currentLemmaLanguage);
Expand Down Expand Up @@ -211,6 +214,33 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) {
}
}

/**
 * Drops every grammeme of the given type when more than one grammeme of that type is present.
 * <p>
 * Multiple genders at the lemma level express a ranking rather than values that apply to all forms,
 * so that data is not usable and is discarded. (Multiple genders at the form level are different:
 * there the same form is valid for all specified genders.)
 *
 * @param grammemes the grammemes to clean up; modified in place
 * @param grammemeType the grammeme class to check for conflicts, for example {@code Gender.class}
 */
private void removeConflicts(TreeSet<Enum<?>> grammemes, Class<?> grammemeType) {
    // Zero or one grammeme can never conflict.
    if (grammemes.size() <= 1) {
        return;
    }
    long matching = grammemes.stream().filter(grammemeType::isInstance).count();
    if (matching > 1) {
        // More than one value of this type: none of them can be trusted, so remove them all.
        grammemes.removeIf(grammemeType::isInstance);
    }
}

private void convertGrammemes(LexemeForm form, Inflection currentInflection, String id, String lemma) {
for (var feature : form.grammaticalFeatures) {
Set<? extends Enum<?>> values = Grammar.getMappedGrammemes(feature);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
#
# These are lexemes that should either be ignored because they are irrelevant in a way that can't easily be tagged,
# or uncommon words that should be sorted last in the inflection patterns.
L15388=rare
L299075=omit
# TODO remove this, since it is fixed upstream.
L342586=omit
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be removed after the next Wikidata dump is consumed.


L299075=omit
L468896=omit
L469033=omit
L469036=omit
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright 2025 Unicode Incorporated and others. All rights reserved.
#
# These are lexemes that should either be ignored because they are irrelevant in a way that can't easily be tagged,
# or uncommon words that should be sorted last in the inflection patterns.
L128740=omit
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This noun is not typical, and it conflicts with the common pronoun. Remove it for now to deconflict it.

L166820=omit
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is стола, which is also the feminine form of стол (table). There is a conflict here. There are ways to deconflict this, but let's exclude this for now.