Skip to content

Commit 0265a80

Browse files
committed
Build a set of satellite offsets
1 parent 46e220c commit 0265a80

File tree

1 file changed

+28
-9
lines changed

1 file changed

+28
-9
lines changed

nltk/corpus/reader/wordnet.py

Lines changed: 28 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1204,6 +1204,9 @@ def __init__(self, root, omw_reader):
12041204
assert int(index) == i
12051205
self._lexnames.append(lexname)
12061206

1207+
# Build a set of adjective satellite offsets
1208+
self._scan_satellites()
1209+
12071210
# Load the indices for lemmas and synset offsets
12081211
self._load_lemma_pos_offset_map()
12091212

@@ -1379,8 +1382,24 @@ def langs(self):
13791382
"""return a list of languages supported by Multilingual Wordnet"""
13801383
return list(self.provenances.keys())
13811384

1382-
def _load_lemma_pos_offset_map(self):
1385+
def _scan_satellites(self):
    """Collect the offsets of every adjective satellite synset.

    Reads the adjective data file (``self._data_file(ADJ)``) once from
    the start and records, as an ``int``, the first field of each entry
    whose third whitespace-separated field equals ``ADJ_SAT``.  The
    resulting set is stored in ``self.satellite_offsets`` so later
    membership checks need no further file seeks.

    The file position is rewound to 0 before and after the scan.
    """
    adj_data_file = self._data_file(ADJ)
    adj_data_file.seek(0)

    satellite_offsets = set()
    for line in adj_data_file:
        # Lines that begin with a space are not synset entries
        # (presumably the license header of the data file -- confirm
        # against the WordNet database format).
        if line.startswith(" "):
            continue
        # split() with no argument already discards surrounding
        # whitespace, so blank lines yield an empty list and are
        # rejected by the length check below.
        fields = line.split()
        if len(fields) < 3:
            continue
        # fields[0] is the synset offset, fields[2] the synset type.
        # Use the ADJ_SAT constant, as the rest of this file does,
        # instead of a hard-coded type letter.
        if fields[2] == ADJ_SAT:
            satellite_offsets.add(int(fields[0]))

    # Rewind in case the same file object is read elsewhere afterwards.
    adj_data_file.seek(0)
    self.satellite_offsets = satellite_offsets
1401+
1402+
def _load_lemma_pos_offset_map(self):
13841403
for suffix in self._FILEMAP.values():
13851404
# parse each line of the file (ignoring comment lines)
13861405
with self.open("index.%s" % suffix) as fp:
@@ -1426,15 +1445,15 @@ def _next_token():
14261445
# map lemmas and parts of speech to synsets
14271446
self._lemma_pos_offset_map[lemma][pos] = synset_offsets
14281447
if pos == ADJ:
1429-
sat_offsets = []
1430-
for offset in synset_offsets:
1431-
adj_data_file.seek(offset)
1432-
# Check in data.adj if offset pos is ADJ_SAT
1433-
if adj_data_file.readline()[12:13] == ADJ_SAT:
1434-
sat_offsets.append(offset)
1435-
if sat_offsets:
1448+
# Filter adjective satellites:
1449+
satellite_offsets = [
1450+
of for of in synset_offsets if of in self.satellite_offsets
1451+
]
1452+
if satellite_offsets:
14361453
# Duplicate only real satellites
1437-
self._lemma_pos_offset_map[lemma][ADJ_SAT] = sat_offsets
1454+
self._lemma_pos_offset_map[lemma][
1455+
ADJ_SAT
1456+
] = satellite_offsets
14381457

14391458
def _load_exception_map(self):
14401459
# load the exception file data into memory

0 commit comments

Comments
 (0)