44files. Compatible with NLTK's CMUDictCorpusReader.
55"""
66
7+ import atexit
78import re
8- import sys
9- from collections import defaultdict
109from contextlib import ExitStack
11- import atexit
12-
13- if sys .version_info >= (3 , 9 ):
14- from importlib import metadata , resources
15- else :
16- import importlib_metadata as metadata
17- import importlib_resources as resources
10+ from importlib import metadata , resources
11+ from typing import IO , Dict , List , Optional , Tuple
1812
1913__version__ = metadata .version (__name__ )
2014
2822atexit .register (file_manager .close )
2923
3024
31- def _stream (resource_name ) :
32- stream = resources .files (__name__ ).joinpath (resource_name ).open ("rb" )
25+ def _stream (resource_name : str ) -> IO [ bytes ] :
26+ stream : IO [ bytes ] = resources .files (__name__ ).joinpath (resource_name ).open ("rb" )
3327 return stream
3428
3529
36- def _string (resource_name ) :
30+ def _string (resource_name : str ) -> str :
3731 with resources .files (__name__ ).joinpath (resource_name ).open () as file :
3832 return file .read ()
3933
4034
41- def _entries (stream , comment_string = None ):
35+ def _entries (
36+ stream : IO [bytes ], comment_string : Optional [str ] = None
37+ ) -> List [Tuple [str , List [str ]]]:
4238 cmudict_entries = []
4339 for line in stream :
4440 parts = []
@@ -52,113 +48,117 @@ def _entries(stream, comment_string=None):
5248
5349
5450# pylint: disable-next=redefined-builtin
55- def dict ():
51+ def dict () -> Dict [ str , List [ List [ str ]]] :
5652 """
5753 Compatibility with NLTK.
5854 Returns the cmudict lexicon as a dictionary, whose keys are
5955 lowercase words and whose values are lists of pronunciations.
6056 """
61- default = defaultdict ( list )
57+ default : Dict [ str , List [ List [ str ]]] = {}
6258 for key , value in entries ():
59+ if key not in default :
60+ default [key ] = []
6361 default [key ].append (value )
6462 return default
6563
6664
67- def dict_stream ():
65+ def dict_stream () -> IO [ bytes ] :
6866 """Return a readable file-like object of the cmudict.dict file."""
69- stream = _stream (CMUDICT_DICT )
67+ stream : IO [ bytes ] = _stream (CMUDICT_DICT )
7068 return stream
7169
7270
73- def dict_string ():
71+ def dict_string () -> str :
7472 """Return the contents of cmudict.dict as a string."""
7573 string = _string (CMUDICT_DICT )
7674 return string
7775
7876
79- def license_string ():
77+ def license_string () -> str :
8078 """Return the contents of LICENSE as a string."""
8179 string = _string (CMUDICT_LICENSE )
8280 return string
8381
8482
85- def phones ():
83+ def phones () -> List [ Tuple [ str , List [ str ]]] :
8684 """Return a list of phones used in the main dict."""
87- cmu_phones = []
85+ cmu_phones : List [ Tuple [ str , List [ str ]]] = []
8886 for line in phones_stream ():
8987 parts = line .decode ("utf-8" ).strip ().split ()
9088 cmu_phones .append ((parts [0 ], parts [1 :]))
9189 return cmu_phones
9290
9391
94- def phones_stream ():
92+ def phones_stream () -> IO [ bytes ] :
9593 """Return a readable file-like object of the cmudict.phones file."""
96- p_stream = _stream (CMUDICT_PHONES )
94+ p_stream : IO [ bytes ] = _stream (CMUDICT_PHONES )
9795 return p_stream
9896
9997
100- def phones_string ():
98+ def phones_string () -> str :
10199 """Return the contents of cmudict.phones as a string."""
102100 string = _string (CMUDICT_PHONES )
103101 return string
104102
105103
106- def symbols ():
104+ def symbols () -> List [ str ] :
107105 """Return a list of symbols."""
108- cmu_symbols = []
106+ cmu_symbols : List [ str ] = []
109107 for line in symbols_stream ():
110108 cmu_symbols .append (line .decode ("utf-8" ).strip ())
111109 return cmu_symbols
112110
113111
114- def symbols_stream ():
112+ def symbols_stream () -> IO [ bytes ] :
115113 """Return a readable file-like object of the cmudict.symbols file."""
116- stream = _stream (CMUDICT_SYMBOLS )
114+ stream : IO [ bytes ] = _stream (CMUDICT_SYMBOLS )
117115 return stream
118116
119117
120- def symbols_string ():
118+ def symbols_string () -> str :
121119 """Return the contents of cmudict.symbols as a string."""
122120 string = _string (CMUDICT_SYMBOLS )
123121 return string
124122
125123
126124# pylint: disable-next=invalid-name
127- def vp ():
125+ def vp () -> Dict [ str , List [ List [ str ]]] :
128126 """Return a dictionary of punctuation pronunciations."""
129- cmu_vp = defaultdict ( list )
127+ cmu_vp : Dict [ str , List [ List [ str ]]] = {}
130128 with vp_stream () as stream :
131129 for key , value in _entries (stream ):
130+ if not key in cmu_vp :
131+ cmu_vp [key ] = []
132132 cmu_vp [key ].append (value )
133133 return cmu_vp
134134
135135
136- def vp_stream ():
136+ def vp_stream () -> IO [ bytes ] :
137137 """Return a readable file-like object of the cmudict.vp file."""
138- stream = _stream (CMUDICT_VP )
138+ stream : IO [ bytes ] = _stream (CMUDICT_VP )
139139 return stream
140140
141141
142- def vp_string ():
142+ def vp_string () -> str :
143143 """Return the contents of cmudict.vp as a string."""
144144 string = _string (CMUDICT_VP )
145145 return string
146146
147147
148148# The .entries(), .raw(), and .words() functions
149149# maintain compatibility with NLTK.
150- def entries ():
150+ def entries () -> List [ Tuple [ str , List [ str ]]] :
151151 """
152152 Compatibility with NLTK.
153153 Returns the cmudict lexicon as a list of entries
154154 containing (word, transcriptions) tuples.
155155 """
156156 with dict_stream () as stream :
157- cmu_entries = _entries (stream , "#" )
157+ cmu_entries : List [ Tuple [ str , List [ str ]]] = _entries (stream , "#" )
158158 return cmu_entries
159159
160160
161- def raw ():
161+ def raw () -> str :
162162 """
163163 Compatibility with NLTK.
164164 Returns the cmudict lexicon as a raw string.
@@ -167,7 +167,7 @@ def raw():
167167 return string
168168
169169
170- def words ():
170+ def words () -> List [ str ] :
171171 """
172172 Compatibility with NLTK.
173173 Returns a list of all words defined in the cmudict lexicon.
0 commit comments