44files. Compatible with NLTK's CMUDictCorpusReader.
55"""
66
7+ import atexit
78import re
8- from collections import defaultdict
99from contextlib import ExitStack
10- import atexit
11-
1210from importlib import metadata , resources
11+ from typing import IO , Dict , List , Optional , Tuple
1312
1413__version__ = metadata .version (__name__ )
1514
2322atexit .register (file_manager .close )
2423
2524
def _stream(resource_name: str) -> IO[bytes]:
    """Open a resource bundled with this package for binary reading.

    The stream is returned open; this function does not close it.
    """
    resource = resources.files(__name__).joinpath(resource_name)
    return resource.open("rb")
2928
3029
def _string(resource_name: str) -> str:
    """Return the full text of a resource bundled with this package.

    The text is decoded explicitly as UTF-8 rather than with the
    platform-default encoding, matching how the line-based readers in
    this module decode the same data files.
    """
    resource = resources.files(__name__).joinpath(resource_name)
    return resource.read_text(encoding="utf-8")
3433
3534
36- def _entries (stream , comment_string = None ):
35+ def _entries (
36+ stream : IO [bytes ], comment_string : Optional [str ] = None
37+ ) -> List [Tuple [str , List [str ]]]:
3738 cmudict_entries = []
3839 for line in stream :
3940 parts = []
@@ -47,113 +48,117 @@ def _entries(stream, comment_string=None):
4748
4849
# pylint: disable-next=redefined-builtin
def dict() -> Dict[str, List[List[str]]]:
    """
    Compatibility with NLTK.
    Returns the cmudict lexicon as a dictionary, whose keys are
    lowercase words and whose values are lists of pronunciations.
    """
    lexicon: Dict[str, List[List[str]]] = {}
    for word, pronunciation in entries():
        # Create the per-word list on first sight, then append to it.
        lexicon.setdefault(word, []).append(pronunciation)
    return lexicon
6063
6164
def dict_stream() -> IO[bytes]:
    """Open the cmudict.dict file and return it as a readable binary stream."""
    return _stream(CMUDICT_DICT)
6669
6770
def dict_string() -> str:
    """Read cmudict.dict and return its entire contents as one string."""
    return _string(CMUDICT_DICT)
7275
7376
def license_string() -> str:
    """Read the LICENSE file and return its entire contents as one string."""
    return _string(CMUDICT_LICENSE)
7881
7982
def phones() -> List[Tuple[str, List[str]]]:
    """Return a list of phones used in the main dict.

    Each item is a ``(first_field, remaining_fields)`` tuple built from
    one whitespace-separated line of the cmudict.phones file.
    """
    cmu_phones: List[Tuple[str, List[str]]] = []
    # Close the stream deterministically instead of leaking the handle.
    with phones_stream() as stream:
        for line in stream:
            parts = line.decode("utf-8").strip().split()
            if not parts:
                # A blank line has no first field; skip it rather than
                # fail with IndexError.
                continue
            cmu_phones.append((parts[0], parts[1:]))
    return cmu_phones
8790
8891
def phones_stream() -> IO[bytes]:
    """Open the cmudict.phones file and return it as a readable binary stream."""
    return _stream(CMUDICT_PHONES)
9396
9497
def phones_string() -> str:
    """Read cmudict.phones and return its entire contents as one string."""
    return _string(CMUDICT_PHONES)
99102
100103
def symbols() -> List[str]:
    """Return a list of symbols.

    Each line of the cmudict.symbols file is decoded as UTF-8 and
    stripped of surrounding whitespace.
    """
    # Close the stream deterministically instead of leaking the handle.
    with symbols_stream() as stream:
        return [line.decode("utf-8").strip() for line in stream]
107110
108111
def symbols_stream() -> IO[bytes]:
    """Open the cmudict.symbols file and return it as a readable binary stream."""
    return _stream(CMUDICT_SYMBOLS)
113116
114117
def symbols_string() -> str:
    """Read cmudict.symbols and return its entire contents as one string."""
    return _string(CMUDICT_SYMBOLS)
119122
120123
# pylint: disable-next=invalid-name
def vp() -> Dict[str, List[List[str]]]:
    """Return a dictionary of punctuation pronunciations.

    Keys are the entry keys parsed from the cmudict.vp file; each value
    is the list of pronunciations recorded for that key.
    """
    cmu_vp: Dict[str, List[List[str]]] = {}
    with vp_stream() as stream:
        for key, value in _entries(stream):
            # Create the per-key list on first sight, then append to it.
            cmu_vp.setdefault(key, []).append(value)
    return cmu_vp
129134
130135
def vp_stream() -> IO[bytes]:
    """Open the cmudict.vp file and return it as a readable binary stream."""
    return _stream(CMUDICT_VP)
135140
136141
def vp_string() -> str:
    """Read cmudict.vp and return its entire contents as one string."""
    return _string(CMUDICT_VP)
141146
142147
# The .entries(), .raw(), and .words() functions
# maintain compatibility with NLTK.
def entries() -> List[Tuple[str, List[str]]]:
    """
    Compatibility with NLTK.
    Returns the cmudict lexicon as a list of entries
    containing (word, transcriptions) tuples.
    """
    # "#" is passed as the comment string for the main dictionary file.
    with dict_stream() as dict_file:
        return _entries(dict_file, "#")
154159
155160
156- def raw ():
161+ def raw () -> str :
157162 """
158163 Compatibility with NLTK.
159164 Returns the cmudict lexicon as a raw string.
@@ -162,7 +167,7 @@ def raw():
162167 return string
163168
164169
165- def words ():
170+ def words () -> List [ str ] :
166171 """
167172 Compatibility with NLTK.
168173 Returns a list of all words defined in the cmudict lexicon.
0 commit comments