-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild_dict.py
More file actions
52 lines (40 loc) · 1.55 KB
/
build_dict.py
File metadata and controls
52 lines (40 loc) · 1.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import csv
import json
import logging
import os
import re
import util
# Build the HSK 3.0 word list: for every word found in the files under
# `input_dir`, collect the data returned by util.get_info(), tag it with the
# HSK level taken from the file name and the Wubi key codes of its characters,
# then write the whole list to `output_file` as JSON.
logging.basicConfig(level="INFO")

input_dir = "./data/elkmovie_hsk30/"
output_file = "./assets/hsk30_words.json"
wubi_file = "./data/wubi86_stripped.txt"

info = []

with open(wubi_file, "r", encoding="utf-8-sig") as wf:
    wubi_dictionary = wf.read()

# Index the Wubi table once instead of regex-scanning the whole text for every
# character. Only lines of the exact form "<char>\t<lowercase keys>" count;
# setdefault keeps the first entry for a character, which is the one a
# top-to-bottom search of the text would have found.
wubi_codes = {}
for entry in re.finditer(r"^(.)\t([a-z]+)$", wubi_dictionary, re.MULTILINE):
    wubi_codes.setdefault(entry.group(1), entry.group(2))

# A word line starts with "<number> <word>"; the word ends at the first
# "(", "|" or whitespace character.
word_pattern = re.compile(r"(\d+) ([^(|\s]+)")

# os.listdir() order is unspecified; sort so the output is reproducible.
for input_file in sorted(os.listdir(input_dir)):
    level_match = re.search(r"[1-7]", input_file)
    if level_match is None:
        # Stray file without a level digit in its name: skip it instead of
        # crashing on `None.group()`.
        logging.warning("No HSK level found in file name '%s', skipping.", input_file)
        continue
    hsk_level = level_match.group()  # use 7 for 7 through 9

    with open(os.path.join(input_dir, input_file), "r", encoding="utf-8-sig") as f:
        for line in f:
            match = word_pattern.match(line)
            if match is None:
                # Blank lines are skipped silently; anything else that does
                # not look like a word line is reported and skipped.
                if line.strip():
                    logging.warning(
                        "Unrecognised line in '%s', skipping: %r", input_file, line
                    )
                continue
            word = match.group(2)

            logging.info("Getting info for '%s'.", word)
            # NOTE(review): util.get_info is assumed to return a mutable,
            # JSON-serialisable dict -- confirm against util.
            word_info = util.get_info(word)
            word_info["hsk_level"] = hsk_level

            # find the wubi strokes for each character
            wubi = []
            for char in word:
                keys = wubi_codes.get(char)
                if keys is None:
                    # this pops up a couple times, ignore it (for now)
                    logging.warning(
                        "Unable to find wubi strokes for '%s', skipping.", char
                    )
                    # Skip only this character so that no stale code from a
                    # previous character is appended in its place.
                    continue
                wubi.append(keys)
            word_info["wubi"] = " ".join(wubi)

            info.append(word_info)

# ensure_ascii=False writes raw CJK characters, so the encoding must be
# explicit; the platform default is not UTF-8 everywhere.
with open(output_file, "w", encoding="utf-8") as f:
    f.write(json.dumps(info, ensure_ascii=False))