lexicon/lexicon.py at master · jcjohnston/lexicon · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
# lexicon.py jcj 2019-02-20, 2020-01-23, 2020-02-07, 2020-05-10

'''A class to implement a lexicon with methods for lookup of
definitions with examples, synonyms and other related words,
pronunciations, homophones, anagrams, and regular expressions,
using a user-supplied lexicon and/or the facilities of WordNet'''

# Standard-library imports
import sys
import re
import bisect
import unicodedata as ud
from collections import namedtuple, defaultdict, OrderedDict

from my.constants import StrConsts

# WordNet-related stuff (may be unavailable)
WORDNET = False
try:
    from nltk.corpus import wordnet as wn
    from nltk.stem import WordNetLemmatizer
    WORDNET = True
except ImportError:
    pass

if WORDNET:
    wnl = WordNetLemmatizer()
    WordNetInfo = namedtuple('WordNetInfo', 'base, pos, number, info')
    WNPARAMS = namedtuple('WNPARAMS', 'GROUP, EXTENT, LEVEL, OUTTYPE')
    WNG = StrConsts('defs, rels')         # group to which relation belongs
    WNE = StrConsts('head, syns')         # extent of lemmas returned
    WNL = StrConsts('syn, lem, sub, oth') # level from which relation is drawn
    WNKINDS = OrderedDict([
        ('definition',          WNPARAMS(WNG.DEFS, WNE.HEAD, WNL.SYN, list)),
        ('examples',            WNPARAMS(WNG.DEFS, WNE.HEAD, WNL.SYN, list)),
        ('frame_strings',       WNPARAMS(WNG.DEFS, WNE.HEAD, WNL.LEM, list)),
        ('synonyms',            WNPARAMS(WNG.RELS, WNE.SYNS, WNL.OTH, str)),
        ('antonyms',            WNPARAMS(WNG.RELS, WNE.SYNS, WNL.LEM, str)),
        ('pertainyms',          WNPARAMS(WNG.RELS, WNE.SYNS, WNL.LEM, str)),
        ('hypernyms',           WNPARAMS(WNG.RELS, WNE.SYNS, WNL.SUB, str)),
        ('hyponyms',            WNPARAMS(WNG.RELS, WNE.SYNS, WNL.SUB, str)),
        ('part_meronyms',       WNPARAMS(WNG.RELS, WNE.SYNS, WNL.SUB, str)),
        ('part_holonyms',       WNPARAMS(WNG.RELS, WNE.SYNS, WNL.SUB, str)),
        ('substance_meronyms',  WNPARAMS(WNG.RELS, WNE.SYNS, WNL.SUB, str)),
        ('substance_holonyms',  WNPARAMS(WNG.RELS, WNE.SYNS, WNL.SUB, str)),
        ('entailments',         WNPARAMS(WNG.RELS, WNE.SYNS, WNL.SUB, str)),
        ('derivationally_related_forms',
                                WNPARAMS(WNG.DEFS, WNE.HEAD, WNL.LEM, str)),
        ])
    WNCATS = { 'n': 'N', 'v': 'V', 'a': 'Adj', 's': 'Sat', 'r': 'Adv' }

# Constants for busyWait callable
PHASES = 5     # How many times is each entry processed?
INTERVAL = 5   # How often in % is progress notified?
READING, NORMALIZING, SORTING, HASHING, DONE = 0, 1, 2, 3, 4   # The phases

__all__ = ['Lexicon', 'LexiconError']

class LexiconError(Exception):
    pass

class Lexicon:

    def __init__(self, options=None, busyWait=None):
        '''Initialize an object representing a lexicon from a disk file'''
        if not options: options = {}
        self.fileName = options.get('lexicon', '')
        self.caseBlind = options.get('icase')
        self.diacFilter = {}
        if options.get('idiac'):
            self.diacFilter = dict.fromkeys(c for c in range(sys.maxunicode)
                                            if ud.combining(chr(c)))
        self.language = options.get('language', 'eng') if WORDNET else ''
        self.words = []                  # list of sorted, normalized words
        self.refs = defaultdict(list)    # dict from normalized words
                                         # to lists of reference forms
                                         # eg POLISH -> [Polish, polish]
        self.anags = defaultdict(list)   # dict from normalized anagrams
                                         # to lists of anagrammatic forms
                                         # eg abeert -> [beater, berate, rebate]
        self.prons = defaultdict(list)   # spelling -> pronunciations
        self.spells = defaultdict(list)  # pronunciation -> spellings
        self.numLines = self.numProns = self.numVars = 0
        if self.fileName:
            self.readLexicon(self.fileName, busyWait)

    def readLexicon(self, fileName, busyWait):
        '''Read in a lexicon file and initialize various structures'''
        if not busyWait:
            busyWait = lambda phase, percent: None
        try:
            f = open(self.fileName)
        except Exception as err:
            raise LexiconError(err)
        busyWait(READING, 0)
        # Read in a list of lines, without showing any progress.
        # But then use the number of words read in to time progress updates
        lines = [ line.rstrip('\n') for line in f.readlines()
                                    if not line.startswith('#') ]
        self.numLines = len(lines)
        pcPerLine = 100 / (self.numLines * PHASES)   # NB result is a real
        pcDone = pcPerLine * self.numLines   # for the read itself
        # build dict of normalized forms in self.refs
        # and pronunciation-related dictionaries in self.prons and self.spells
        busyWait(NORMALIZING, int(pcDone))
        # we need the unPythonic indexed loop because we change line
        for i in range(len(lines)):
            line = lines[i]
            pcDone += pcPerLine
            if pcDone % INTERVAL < pcPerLine:
                busyWait(NORMALIZING, int(pcDone))
            tabpos = line.find('\t')
            if tabpos < 0:
                tabpos = len(line)
            normal = self.normalized(line[:tabpos], self.caseBlind,
                                     self.diacFilter)
            variants = line[tabpos+1:]
            line = line[:tabpos]
            # store pronunciations
            if variants:
                self.numProns += 1
                variants = variants.split(', ')
                self.numVars += len(variants)
                for variant in variants:
                    if variant not in self.prons[normal]:
                        self.prons[normal].append(variant)
                    self.spells[variant].append(line)
            lines[i] = line
            self.refs[normal].append(line)
        # build sorted list of normalized word forms for lookup
        busyWait(SORTING, int(pcDone))
        self.words = sorted(self.refs)   # includes only the keys
        pcDone += pcPerLine * self.numLines
        # build a separate dictionary of normalized anagram forms
        busyWait(HASHING, int(pcDone))
        for line in lines:
            pcDone += pcPerLine
            if pcDone % INTERVAL < pcPerLine:
                busyWait(HASHING, int(pcDone))
            hash = self.anagramHash(line)
            self.anags[hash].append(line)
        busyWait(DONE, 100)
        if f:
            f.close()

    def normalized(self, s, caseBlind, diacFilter):
        '''Apply needed transformations to ignore case and/or accents'''
        if caseBlind:
            s = s.casefold()
        if diacFilter:
            s = ud.normalize('NFKD', s)
            s = s.translate(self.diacFilter)
        return s

    def anagramHash(self, s):
        '''Create a unique hash for anagram purposes, ignoring
        order, letter-case, and all punctuation. Diacritics
        are ignored only if self.diacFilter is set'''
        # Unfortunately the \w class (Unicode 'word' characters)
        # includes the underscore, so _ must be special-cased
        s = s.replace('_', '')
        # Upper-case the string, remove all characters which are not \w,
        # and sort the result: thus all anagrammatic strings get the same hash
        return ''.join(sorted(re.sub(r'\W', '',
                        self.normalized(s, True, self.diacFilter))))

    def WNInfo(self, word, kinds):
        '''Return a list of WordNetInfo tuples of information about word'''

        def WNNormalized(word):
            '''Return a WordNet-normalized version of word: all lower-case and
            no apostrophes'''
            return word.lower().replace("'", '')

        def addName(dic, key, name):
            '''Create dic[key] as an empty list if necessary and append name
            to it if not already present'''
            if key not in dic:
                dic[key] = []
            lst = dic[key]
            if name not in lst:
                lst.append(name)

        def excludeName(kind, lemma):
            '''Return whether this lemma should be excluded because the kind
            is restricted to strict lemmas of the headword'''
            return ((WNKINDS[kind].EXTENT == WNE.HEAD) and
                    (lemma.name() not in heads))

        results = []
        heads = set()
        for cat in WNCATS.keys():
            heads.add(wnl.lemmatize(word, cat))
        levelKinds = {}
        for level in (WNL.SYN, WNL.LEM, WNL.SUB):
            levelKinds[level] = [kind for kind in kinds
                                 if WNKINDS[kind].LEVEL == level]
        for synset in wn.synsets(WNNormalized(word), lang=self.language):
            synset_base, synset_pos, synset_number = synset.name().split('.')
            info = OrderedDict()
            # relations defined as synset.RELATION
            for kind in levelKinds[WNL.SYN]:
                info[kind] = getattr(synset, kind)()
            for lemma in synset.lemmas(lang=self.language):
                # synonyms are a special case: they are simply synset.lemma
                if 'synonyms' in kinds:
                    addName(info, 'synonyms', lemma.name())
                # relations defined as synset.lemma.RELATION
                for kind in levelKinds[WNL.LEM]:
                    if excludeName(kind, lemma):
                        continue
                    for nym in getattr(lemma, kind)():
                        try:
                            name = nym.name()
                        except AttributeError:
                            name = nym
                        addName(info, kind, name)
            # relations defined as synset.RELATION.lemma
            for kind in levelKinds[WNL.SUB]:
                for item in getattr(synset, kind)():
                    for lemma in item.lemmas(lang=self.language):
                        if excludeName(kind, lemma):
                            continue
                        name = lemma.name()
                        addName(info, kind, name)
            # make the output be of the required type
            for key in info.keys():
                if ((WNKINDS[key].OUTTYPE == str) and
                    (type(info[key]) != str)):
                    info[key] = ', '.join(info[key])
                elif ((WNKINDS[key].OUTTYPE == list) and
                    (type(info[key]) != list)):
                    info[key] = [ info[key] ]
            results.append(WordNetInfo(synset_base, WNCATS[synset_pos],
                            synset_number, info))
        return results

### Client-facing definitions follow ###

    @property
    def hasWordNet(self):
        return WORDNET

    @staticmethod
    def languages():
        '''Return a list of the ISO 639-3 codes for the WordNet-supported
        languages.'''
        return wn.langs() if WORDNET else []

    def getLanguage(self):
        '''Return the language for WordNet lookups'''
        return self.language

    def setLanguage(self, language):
        '''Set the language for WordNet lookups if it is supported'''
        if language in self.languages():
            self.language = language

    def stats(self):
        '''Return the number of various things as a tuple'''
        return self.numLines, self.numProns, self.numVars

    def contains(self, word):
        '''Return a list of matching spellings: its main purpose
        is to return a list of different headwords after normalization'''
        word = self.normalized(word, self.caseBlind, self.diacFilter)
        i = bisect.bisect_left(self.words, word)
        if i != len(self.words) and self.words[i] == word:
            return self.refs[word]
        else:
            return []

    def prefixed(self, prefix):
        '''Return a list of words with prefix'''
        prefix = self.normalized(prefix, self.caseBlind, self.diacFilter)
        return [ ' '.join(self.refs[w]) for w in self.words
                    if w.startswith(prefix) ]

    def suffixed(self, suffix):
        '''Return a list of words with suffix'''
        suffix = self.normalized(suffix, self.caseBlind, self.diacFilter)
        return [ ' '.join(self.refs[w]) for w in self.words
                    if w.endswith(suffix) ]

    def definitions(self, word):
        '''Return a list of WordNetInfo tuples for each sense of word.
        Each tuple will contain the definition and list, possibly empty,
        of examples.'''
        if WORDNET:
            word = self.normalized(word, False, self.diacFilter)
            return self.WNInfo(word, [key for key, val in WNKINDS.items()
                                     if val.GROUP == WNG.DEFS])
        else:
            return self.contains(word)

    def related(self, word):
        '''Return a list of WordNetInfo tuples for each sense of word.
        Each tuple will contain possibly empty lists of synonyms etc.'''
        if WORDNET:
            word = self.normalized(word, False, self.diacFilter)
            return self.WNInfo(word, [key for key, val in WNKINDS.items()
                                      if val.GROUP == WNG.RELS])
        else:
            return []

    def pronunciations(self, word):
        '''Return a list of pronunciations of word'''
        return self.prons.get(self.normalized(word, self.caseBlind,
                              self.diacFilter))

    def homophones(self, word):
        '''Return a list of homophones of the word'''
        results = []
        prons = self.prons.get(self.normalized(word, self.caseBlind,
                                               self.diacFilter))
        if prons:
            for pron in prons:
                for variant in self.spells[pron]:
                    if variant not in results:
                        results.append(variant)
        return results

    def anagrams(self, word):
        '''Return a list of anagrams of word, ignoring case and punctuation'''
        return self.anags.get(self.anagramHash(word), [])

    def regex(self, pattern):
        '''Return list of matching words'''
        # we can normalize case but we can't do anything about diacritics
        flags = re.IGNORECASE if self.caseBlind else 0
        try:
            pattern = re.compile(pattern, flags)
        except Exception as err:   # probably an invalid regex
            raise LexiconError(str(err))
        matches = []
        for word in self.words:
            m = pattern.search(word)
            if m:
                matches.extend(self.refs[word])
        return matches

if __name__ == '__main__':
    print('This module is intended to be imported rather than run standalone')