108108 "Somebody %s VERB-ing" ,
109109 "It %s that CLAUSE" ,
110110 "Something %s INFINITIVE" ,
111+ # OEWN additions:
112+ "Somebody %s at something" ,
113+ "Somebody %s for something" ,
114+ "Somebody %s on somebody" ,
115+ "Somebody %s out of somebody" ,
111116)
112117
113118SENSENUM_RE = re .compile (r"\.[\d]+\." )
@@ -454,7 +459,7 @@ def _doc(self, doc_type, default, lang="eng"):
454459 corpus ._load_lang_data (lang )
455460 of = corpus .ss2of (self )
456461 i = corpus .lg_attrs .index (doc_type )
457- if of in corpus ._lang_data [lang ][i ]. keys () :
462+ if of in corpus ._lang_data [lang ][i ]:
458463 return corpus ._lang_data [lang ][i ][of ]
459464 else :
460465 return None
@@ -481,11 +486,11 @@ def lemma_names(self, lang="eng"):
481486 if lang == "eng" :
482487 return self ._lemma_names
483488 else :
484- self ._wordnet_corpus_reader . _load_lang_data ( lang )
485-
486- i = self . _wordnet_corpus_reader . ss2of (self , lang )
487- if i in self . _wordnet_corpus_reader ._lang_data [lang ][0 ]:
488- return self . _wordnet_corpus_reader ._lang_data [lang ][0 ][i ]
489+ reader = self ._wordnet_corpus_reader
490+ reader . _load_lang_data ( lang )
491+ i = reader . ss2of (self )
492+ if i in reader ._lang_data [lang ][0 ]:
493+ return reader ._lang_data [lang ][0 ][i ]
489494 else :
490495 return []
491496
@@ -1203,59 +1208,77 @@ def __init__(self, root, omw_reader):
12031208 # load the exception file data into memory
12041209 self ._load_exception_map ()
12051210
1211+ self .nomap = []
1212+ self .splits = {}
1213+
12061214 # map from WordNet 3.0 for OMW data
12071215 self .map30 = self .map_wn30 ()
12081216
12091217 # Language data attributes
12101218 self .lg_attrs = ["lemma" , "none" , "def" , "exe" ]
12111219
def index_sense(self, version=None):
    """Read the sense key to synset id mapping from an ``index.sense`` file.

    :param version: name of the corpus whose ``index.sense`` file should be
        read (it is handed to ``LazyCorpusLoader``). When falsy, the file
        is opened through this reader's own ``open`` method instead.
    :return: dict mapping each sense key (first whitespace-separated field
        of a line) to an id of the form ``"<second field>-<pos>"``, where
        ``<pos>`` is looked up in ``self._pos_names`` with the number found
        between ``%`` and the first ``:`` of the sense key.
    """
    fn = "index.sense"
    if version:
        from nltk.corpus import CorpusReader, LazyCorpusLoader

        ixreader = LazyCorpusLoader(version, CorpusReader, r".*/" + fn)
    else:
        ixreader = self
    sensekey_map = {}
    with ixreader.open(fn) as fp:
        for line in fp:
            fields = line.split()
            if len(fields) < 2:
                # Blank or truncated line: there is no key/offset pair to
                # record, so skip it instead of failing with IndexError.
                continue
            sensekey, offset = fields[:2]
            # The number right after "%" selects the part of speech name.
            ss_type = int(sensekey.split("%")[1].split(":")[0])
            sensekey_map[sensekey] = f"{offset}-{self._pos_names[ss_type]}"
    return sensekey_map
1237+
def map_to_many(self, version="wordnet"):
    """Map each synset id of another corpus to its candidate ids here.

    Two sense-key maps are read with ``index_sense``: one for ``version``
    and one for this reader. A sense key present in both contributes the
    id it has in this reader to the candidate list of the id it has in
    ``version``.

    :param version: corpus name passed to ``index_sense`` for the source
        side of the mapping (defaults to ``"wordnet"``).
    :return: dict from every source synset id to a list of target ids.
        The list holds one entry per shared sense key, so a target may be
        repeated; it is empty when no sense key of the source is shared.
    """
    sensekey_map1 = self.index_sense(version)
    sensekey_map2 = self.index_sense()
    # Every source id gets an entry, even when nothing maps to it.
    synset_to_many = {synsetid: [] for synsetid in sensekey_map1.values()}
    # Walking the source map directly (rather than a set intersection)
    # keeps the candidates in a reproducible order.
    for sensekey, source in sensekey_map1.items():
        if sensekey in sensekey_map2:
            synset_to_many[source].append(sensekey_map2[sensekey])
    return synset_to_many
1251+
def map_to_one(self):
    """Reduce the one-to-many synset mapping to a single target per source.

    For every source id returned by ``map_to_many``:

    * no candidates: the source is appended to ``self.nomap``;
    * one distinct candidate: that candidate is the target;
    * several distinct candidates: the ``(count, candidate)`` pairs are
      stored in ``self.splits[source]``, sorted from highest to lowest,
      and the highest pair supplies the target (ties on the count go to
      the greater candidate string).

    :return: dict from source synset id to the chosen target id. A source
        id ending in ``"s"`` is additionally registered under the same id
        ending in ``"a"``.
    """
    from collections import Counter

    synset_to_many = self.map_to_many()
    synset_to_one = {}
    for source, candidates_bag in synset_to_many.items():
        if not candidates_bag:
            self.nomap.append(source)
            continue
        tally = Counter(candidates_bag)
        if len(tally) == 1:
            target = candidates_bag[0]
        else:
            # Sorting makes the recorded split reproducible between runs;
            # the first pair is exactly what max() over the pairs selects.
            counts = sorted(
                ((count, candidate) for candidate, count in tally.items()),
                reverse=True,
            )
            self.splits[source] = counts
            target = counts[0][1]
        synset_to_one[source] = target
        if source[-1] == "s":
            # Add a mapping from "a" to target for applications like omw,
            # where only Lithuanian and Slovak use the "s" ss_type.
            synset_to_one[f"{source[:-1]}a"] = target
    return synset_to_one
12291275
def map_wn30(self):
    """Return the mapping from Wordnet 3.0 to the loaded Wordnet version.

    The result is ``None`` when ``get_version()`` reports ``"3.0"``;
    otherwise it is whatever ``map_to_one()`` returns.
    """
    if self.get_version() != "3.0":
        return self.map_to_one()
    return None
12591282
12601283 # Open Multilingual WordNet functions, contributed by
12611284 # Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn
@@ -1264,19 +1287,16 @@ def of2ss(self, of):
12641287 """take an id and return the synsets"""
12651288 return self .synset_from_pos_and_offset (of [- 1 ], int (of [:8 ]))
12661289
def ss2of(self, ss):
    """Return the ID of the synset.

    :param ss: an object providing ``offset()`` and ``pos()``, or ``None``.
    :return: the offset zero-padded to eight digits, a hyphen and the
        value of ``ss.pos()`` (for example ``"00001740-n"``), or ``None``
        when ``ss`` is ``None``.
    """
    # Test identity rather than truthiness: "no synset" means None, and an
    # actual object must still get its ID even if it evaluates as false.
    if ss is None:
        return None
    return f"{ss.offset():08d}-{ss.pos()}"
12741294
12751295 def _load_lang_data (self , lang ):
12761296 """load the wordnet data of the requested language from the file to
12771297 the cache, _lang_data"""
12781298
1279- if lang in self ._lang_data . keys () :
1299+ if lang in self ._lang_data :
12801300 return
12811301
12821302 if self ._omw_reader and not self .omw_langs :
@@ -1308,7 +1328,7 @@ def add_provs(self, reader):
13081328 file_name , file_extension = os .path .splitext (langfile )
13091329 if file_extension == ".tab" :
13101330 lang = file_name .split ("-" )[- 1 ]
1311- if lang in self .provenances . keys () or prov in ["cldr" , "wikt" ]:
1331+ if lang in self .provenances or prov in ["cldr" , "wikt" ]:
13121332 # We already have another resource for this lang,
13131333 # so we need to further specify the lang id:
13141334 lang = f"{ lang } _{ prov } "
@@ -1540,7 +1560,7 @@ def synset_from_pos_and_offset(self, pos, offset):
15401560 assert synset ._offset == offset
15411561 self ._synset_offset_cache [pos ][offset ] = synset
15421562 else :
1543- synset = Synset ( self )
1563+ synset = None
15441564 warnings .warn (f"No WordNet synset found for pos={ pos } at offset={ offset } ." )
15451565 data_file .seek (0 )
15461566 return synset
@@ -1807,16 +1827,15 @@ def all_omw_synsets(self, pos=None, lang=None):
18071827 if lang not in self .langs ():
18081828 return None
18091829 self ._load_lang_data (lang )
1810- for of in self ._lang_data [lang ][0 ]. keys () :
1811- try :
1830+ for of in self ._lang_data [lang ][0 ]:
1831+ if not pos or of [ - 1 ] == pos :
18121832 ss = self .of2ss (of )
1813- yield ss
1814- except :
1815- # A few OMW offsets don't exist in Wordnet 3.0.
1816- # Additionally, when mapped to later Wordnets,
1817- # increasing numbers of synsets are lost in the mapping.
1818- # warnings.warn(f"Language {lang}: no synset found for {of}")
1819- pass
1833+ if ss :
1834+ yield ss
1835+
1836+ # else:
1837+ # A few OMW offsets don't exist in Wordnet 3.0.
1838+ # warnings.warn(f"Language {lang}: no synset found for {of}")
18201839
18211840 def all_synsets (self , pos = None , lang = "eng" ):
18221841 """Iterate over all synsets with a given part of speech tag.
@@ -1840,12 +1859,14 @@ def all_eng_synsets(self, pos=None):
18401859 # generate all synsets for each part of speech
18411860 for pos_tag in pos_tags :
18421861 # Open the file for reading. Note that we can not re-use
1843- # the file poitners from self._data_file_map here, because
1862+ # the file pointers from self._data_file_map here, because
18441863 # we're defining an iterator, and those file pointers might
18451864 # be moved while we're not looking.
18461865 if pos_tag == ADJ_SAT :
1847- pos_tag = ADJ
1848- fileid = "data.%s" % self ._FILEMAP [pos_tag ]
1866+ pos_file = ADJ
1867+ else :
1868+ pos_file = pos_tag
1869+ fileid = "data.%s" % self ._FILEMAP [pos_file ]
18491870 data_file = self .open (fileid )
18501871
18511872 try :
@@ -1865,12 +1886,11 @@ def all_eng_synsets(self, pos=None):
18651886 # adjective satellites are in the same file as
18661887 # adjectives so only yield the synset if it's actually
18671888 # a satellite
1868- if synset ._pos == ADJ_SAT :
1889+ if pos_tag == ADJ_SAT and synset ._pos == ADJ_SAT :
18691890 yield synset
1870-
18711891 # for all other POS tags, yield all synsets (this means
18721892 # that adjectives also include adjective satellites)
1873- else :
1893+ elif pos_tag != ADJ_SAT :
18741894 yield synset
18751895 offset = data_file .tell ()
18761896 line = data_file .readline ()
@@ -2187,13 +2207,24 @@ def custom_lemmas(self, tab_file, lang):
21872207 offset_pos , label = triple [:2 ]
21882208 val = triple [- 1 ]
21892209 if self .map30 :
2190- if offset_pos in self .map30 . keys () :
2210+ if offset_pos in self .map30 :
21912211 # Map offset_pos to current Wordnet version:
21922212 offset_pos = self .map30 [offset_pos ]
21932213 else :
2194- # Synsets with no mapping keep their Wordnet 3.0 offset
2195- # warnings.warn(f"No map for {offset_pos}, {lang}: {lemma}")
2196- pass
2214+ # Some OMW offsets were never in Wordnet:
2215+ if (
2216+ offset_pos not in self .nomap
2217+ and offset_pos .replace ("a" , "s" ) not in self .nomap
2218+ ):
2219+ warnings .warn (
2220+ f"{ lang } : invalid offset { offset_pos } in '{ line } '"
2221+ )
2222+ continue
2223+ elif offset_pos [- 1 ] == "a" :
2224+ wnss = self .of2ss (offset_pos )
2225+ if wnss and wnss .pos () == "s" : # Wordnet pos is "s"
2226+ # Label OMW adjective satellites back to their Wordnet pos ("s")
2227+ offset_pos = self .ss2of (wnss )
21972228 pair = label .split (":" )
21982229 attr = pair [- 1 ]
21992230 if len (pair ) == 1 or pair [0 ] == lg :
0 commit comments