@@ -145,3 +145,167 @@ def combine_stress(stresses: Union[List[str], Set[str]]) -> str:
def unspace_punct(in_str: str):
    """Delete any run of spaces that immediately precedes sentence punctuation."""
    # Lookahead keeps the punctuation mark itself out of the match,
    # so only the preceding spaces are removed.
    return re.sub(r' +(?=[.?!;:])', '', in_str)
148+
149+
def tixonov(from_cache=True):
    """Map lemma -> sorted list of parses from the Tixonov dictionary file.

    Each parse is a tuple of the '/'-separated segments of the entry
    (presumably morpheme segmentation -- verify against Tixonov.txt);
    a lemma may have more than one parse.

    With ``from_cache=True``, first try to load a pickled copy from
    ``cache_path``, falling back to re-parsing the source file if the
    cache is missing or unreadable.
    """
    import pickle  # local import so this fix is self-contained

    cache_path = f'{FST_DIR}/Tixonov_dict.pkl'
    if from_cache:
        try:
            with open(cache_path, 'rb') as f:
                return pickle.load(f)
        except (FileNotFoundError, EOFError, pickle.UnpicklingError):
            pass  # cache missing or corrupt: rebuild from source below

    tix_dict = defaultdict(list)
    with open(f'{RSRC_DIR}/src/Tixonov.txt') as f:
        for line in f:
            # Segment boundaries are '/'; stress marks '`' are stripped.
            parse = line.strip().replace('`', '').split('/')
            parse = tuple([e for e in parse if e])
            lemma = ''.join(parse)
            # NOTE(review): if the 'a' below is Latin, the range a-я spans
            # Latin AND Cyrillic, so Latin letters would never be reported
            # -- confirm it is meant to be Cyrillic 'а'.
            noncyr = re.sub(r'[a-яё\-]', '', lemma, flags=re.I)
            if noncyr:
                print('Non-cyrillic characters:', lemma, noncyr, file=stderr)
            # TODO verify and remove duplicates
            # if lemma in tix_dict:
            #     print(f'\t{lemma} already in tix_dict:',
            #           f'old: "{tix_dict[lemma]}"',
            #           f'new: "{parse}"', file=stderr)
            if parse not in tix_dict[lemma]:
                tix_dict[lemma].append(parse)

    for lemma, parses in tix_dict.items():
        tix_dict[lemma] = sorted(parses)

    return tix_dict
175+
176+
def tixonov_morph_count():
    """Map lemma -> mean parse length (segment count) over its Tixonov parses."""
    # NOTE(review): cache_path is currently unused -- caching not wired up yet.
    cache_path = f'{FST_DIR}/Tix_morph_count_dict.pkl'
    return {lemma: mean(len(parse) for parse in parses)
            for lemma, parses in tixonov().items()}
185+
186+
def lexmin():
    """Map lemma -> CEFR level ('A1'..'B2') from the lexical-minimum lists.

    Levels are read in ascending order, so for a lemma listed at several
    levels, the highest level wins (later assignments overwrite earlier).
    """
    # NOTE(review): cache_path is currently unused -- caching not wired up yet.
    # (Restored the '{' that made this f-string a SyntaxError.)
    cache_path = f'{FST_DIR}/lexmin_dict.pkl'
    lexmin_dict = {}
    for level in ['A1', 'A2', 'B1', 'B2']:
        with open(f'{RSRC_DIR}/src/lexmin_{level}.txt') as f:
            for lemma in f:
                lemma = lemma.strip()
                if lemma:
                    # TODO verify and remove duplicates
                    # if lemma in lexmin_dict:
                    #     print(f'\t{lemma} ({level}) already in lexmin',
                    #           lexmin_dict[lemma], file=stderr)
                    lexmin_dict[lemma] = level
    return lexmin_dict
201+
202+
def kelly():
    """Map lemma -> level from the Kelly Project (Russian M3) word list.

    The source file is tab-separated: level, frequency, lemma.
    Duplicate lemmas keep the last level seen.
    """
    # NOTE(review): cache_path is currently unused -- caching not wired up yet.
    # (Restored the '{' that made this f-string a SyntaxError.)
    cache_path = f'{FST_DIR}/kelly_dict.pkl'
    kelly_dict = {}
    with open(f'{RSRC_DIR}/src/KellyProject_Russian_M3.txt') as f:
        for line in f:
            level, freq, lemma = line.strip().split('\t')
            # TODO verify and remove duplicates
            # if lemma in kelly_dict:
            #     print(f'{lemma} ({level}) already in kelly_dict',
            #           kelly_dict[lemma], file=stderr)
            kelly_dict[lemma] = level
    return kelly_dict
215+
216+
def rnc_freq():
    """Token frequency data from Russian National Corpus 1-gram data.

    taken from: http://ruscorpora.ru/corpora-freq.html

    Returns dict of token -> frequency (float).  On a duplicate token the
    first value is kept and a warning is printed to stderr.
    """
    # NOTE(review): cache_path is currently unused -- caching not wired up yet.
    # (Restored the '{' that made this f-string a SyntaxError.)
    cache_path = f'{FST_DIR}/RNC_tok_freq_dict.pkl'
    RNC_tok_freq_dict = {}
    with open(f'{RSRC_DIR}/src/RNC_1grams-3.txt') as f:
        for line in f:
            tok_freq, tok = line.split()
            if tok in RNC_tok_freq_dict:
                print(f'\t{tok} already in RNC_tok_freq_dict '
                      f'({tok_freq} vs {RNC_tok_freq_dict[tok]})', file=stderr)
                continue
            RNC_tok_freq_dict[tok] = float(tok_freq)
    return RNC_tok_freq_dict
232+
233+
def rnc_freq_rank():
    """Token frequency *rank* from Russian National Corpus 1-gram data.

    taken from: http://ruscorpora.ru/corpora-freq.html

    Uses "competition" ranking: consecutive tokens with the same frequency
    share a rank, and the rank only advances when the frequency changes.
    """
    # NOTE(review): cache_path is currently unused -- caching not wired up yet.
    # (Restored the '{' that made this f-string a SyntaxError.)
    cache_path = f'{FST_DIR}/RNC_tok_freq_rank_dict.pkl'
    RNC_tok_freq_rank_dict = {}
    with open(f'{RSRC_DIR}/src/RNC_1grams-3.txt') as f:
        rank = 0
        last_freq = None
        for i, line in enumerate(f, start=1):
            tok_freq, tok = line.split()
            if tok_freq != last_freq:
                rank = i
                # BUGFIX: last_freq was never updated, so every line got
                # rank=i and tied frequencies never shared a rank.
                last_freq = tok_freq
            if tok in RNC_tok_freq_rank_dict:
                print(f'\t{tok} already in RNC_tok_freq_rank_dict '
                      f'({rank} vs {RNC_tok_freq_rank_dict[tok]})', file=stderr)
                continue
            RNC_tok_freq_rank_dict[tok] = rank
    return RNC_tok_freq_rank_dict
253+
254+
def sharoff():
    """Lemma frequency data from Serge Sharoff.

    Taken from: http://www.artint.ru/projects/frqlist/frqlist-en.php

    Returns dict of lemma -> frequency (float).  On a duplicate lemma the
    first value is kept and a warning is printed to stderr.
    """
    # TODO what about http://dict.ruslang.ru/freq.php ?

    # NOTE(review): cache_path is currently unused -- caching not wired up yet.
    # (Restored the '{' that made this f-string a SyntaxError.)
    cache_path = f'{FST_DIR}/Sharoff_lem_freq_dict.pkl'

    Sharoff_lem_freq_dict = {}
    with open(f'{RSRC_DIR}/src/Sharoff_lemmaFreq.txt') as f:
        for line in f:
            line_num, freq, lemma, pos = line.split()
            if lemma in Sharoff_lem_freq_dict:
                print(f'{lemma} already in Sharoff_lem_freq_dict. '
                      f'old: {Sharoff_lem_freq_dict[lemma]} '
                      f'new: {(freq, line_num, pos)}', file=stderr)
                continue
            Sharoff_lem_freq_dict[lemma] = float(freq)
    return Sharoff_lem_freq_dict
274+
275+
def sharoff_rank():
    """Lemma frequency *rank* from Serge Sharoff's frequency list.

    Taken from: http://www.artint.ru/projects/frqlist/frqlist-en.php

    Uses "competition" ranking: consecutive lemmas with the same frequency
    share a rank, and the rank only advances when the frequency changes.
    """
    # TODO what about http://dict.ruslang.ru/freq.php ?

    # NOTE(review): cache_path is currently unused -- caching not wired up yet.
    # (Restored the '{' that made this f-string a SyntaxError.)
    cache_path = f'{FST_DIR}/Sharoff_lem_freq_rank_dict.pkl'

    Sharoff_lem_freq_rank_dict = {}
    with open(f'{RSRC_DIR}/src/Sharoff_lemmaFreq.txt') as f:
        rank = None
        last_freq = None
        for i, line in enumerate(f, start=1):
            line_num, freq, lemma, pos = line.split()
            if freq != last_freq:
                rank = i
                # BUGFIX: last_freq was never updated, so every line got
                # rank=i and tied frequencies never shared a rank.
                last_freq = freq
            if lemma in Sharoff_lem_freq_rank_dict:
                print(f'{lemma} already in Sharoff_lem_freq_rank_dict. '
                      f'old: {Sharoff_lem_freq_rank_dict[lemma]} '
                      f'new: {(rank, line_num, pos)}', file=stderr)
                continue
            Sharoff_lem_freq_rank_dict[lemma] = rank
    return Sharoff_lem_freq_rank_dict
299+
300+
def cache_rsrc(resource, fname) -> bool:
    """Attempt to cache (pickle) resource to `fname`.

    Returns True on success, False if the file could not be written or
    the resource could not be pickled.
    """
    # Fixes: '- ->' typo in the signature, text-mode open ('w') for binary
    # pickle data, pickle.dump() missing its file argument, and the missing
    # bool return promised by the annotation.
    try:
        with open(fname, 'wb') as f:
            pickle.dump(resource, f)
        return True
    except (OSError, pickle.PicklingError):
        return False
305+
306+
def uncache_rsrc(fname):
    """Attempt to uncache (unpickle) resource from `fname`.

    Raises OSError if the file cannot be read, or pickle.UnpicklingError
    if its contents are not a valid pickle.
    """
    # SECURITY NOTE: pickle.load executes arbitrary code from the file --
    # only call this on cache files this package wrote itself.
    # Fixed: pickle data is binary, so open in 'rb' (text mode would fail).
    with open(fname, 'rb') as f:
        resource = pickle.load(f)
    return resource
0 commit comments