Skip to content

Commit c5d6ef6

Browse files
committed
Merge split.csv into 小韻表.tsv
Also corrected some outdated 字頭s in 小韻表.tsv.
1 parent 9e726be commit c5d6ef6

File tree

3 files changed

+3933
-3954
lines changed

3 files changed

+3933
-3954
lines changed

build.py

Lines changed: 48 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -12,39 +12,29 @@ class 小韻Row:
1212
音韻地位: str
1313

1414

15-
def load_小韻表() -> dict[str, 小韻Row]:
16-
小韻_data: dict[str, 小韻Row] = {}
15+
def load_小韻表() -> tuple[
16+
dict[str, 小韻Row], dict[str, list[str]], dict[str, list[str]]
17+
]:
18+
小韻_data = dict[str, 小韻Row]()
19+
細分號_by_原書小韻 = dict[str, list[str]]()
20+
細分轄字_by_小韻 = dict[str, list[str]]()
1721
with open('src/小韻表.tsv') as fin:
1822
header = next(fin)
1923
assert header.rstrip('\n').split('\t') == [
2024
'小韻號',
2125
'首字',
2226
'反切',
2327
'音韻地位',
28+
'細分轄字',
2429
], repr(header)
2530
for line in fin:
26-
row = line.rstrip('\n').split('\t')
27-
小韻_data[row[0]] = 小韻Row(*row)
28-
return 小韻_data
29-
30-
31-
def load_小韻細分(
32-
小韻_data: dict[str, 小韻Row],
33-
) -> tuple[dict[str, list[str]], dict[str, list[str]]]:
34-
has_細分: dict[str, list[str]] = {}
35-
小韻細分_data: dict[str, list[str]] = {}
36-
with open('src/split.csv') as fin:
37-
next(fin)
38-
for row in csv.reader(fin):
39-
小韻號 = row[0]
40-
assert 小韻號[-1].isalpha()
41-
反切 = row[1]
42-
assert 小韻_data[小韻號].反切 == 反切, (
43-
f'反切 mismatch in 小韻 #{小韻號}, 小韻_data: {小韻_data[小韻號][2]}, 小韻細分_data: {反切}'
44-
)
45-
has_細分.setdefault(小韻號[:-1], []).append(小韻號[-1])
46-
小韻細分_data[小韻號] = row
47-
return has_細分, 小韻細分_data
31+
小韻號, 首字, 反切, 音韻地位, 細分轄字 = line.rstrip('\n').split('\t')
32+
小韻_data[小韻號] = 小韻Row(小韻號, 首字, 反切, 音韻地位)
33+
if 小韻號[-1].isalpha():
34+
原書小韻號 = 小韻號[:-1]
35+
細分號_by_原書小韻.setdefault(原書小韻號, []).append(小韻號[-1])
36+
細分轄字_by_小韻[小韻號] = 細分轄字
37+
return 小韻_data, 細分號_by_原書小韻, 細分轄字_by_小韻
4838

4939

5040
@dataclass
@@ -114,15 +104,15 @@ class 廣韻Row:
114104

115105

116106
def main():
117-
小韻_data = load_小韻表()
118-
has_細分, 小韻細分_data = load_小韻細分(小韻_data)
107+
小韻_data, 細分號_by_原書小韻, 細分轄字_by_小韻 = load_小韻表()
119108
字序_data = load_字序表()
120109
patches = load_patches()
121110

122-
小韻細分_coverage: dict[str, set[str]] = {}
123-
patch_coverage = set()
111+
小韻號_seen = set[str]()
112+
小韻細分_coverage = dict[str, set[str]]()
113+
patch_coverage = set[tuple[str, str]]()
124114

125-
poem_data: dict[tuple[str, str], dict[str, str]] = {}
115+
poem_data = dict[tuple[str, str], dict[str, str]]()
126116
with open('src/廣韻(20170209).csv') as fin:
127117
for row in csv.DictReader(fin):
128118
key = (row['小韻序'], row['小韻內字序'])
@@ -134,7 +124,7 @@ def main():
134124
poem_小韻內字序 = 字序_data[字序_key].poem_小韻內字序
135125
if not poem_小韻內字序:
136126
poem_反切 = poem_data[(原書小韻號, '1')]['廣韻反切(覈校後)']
137-
字頭 = ''
127+
含原貌字頭 = ''
138128
釋義 = ''
139129
釋義參照 = ''
140130
else:
@@ -149,7 +139,7 @@ def main():
149139
字頭覈校說明,
150140
poem_反切,
151141
字頭原貌,
152-
字頭,
142+
含原貌字頭,
153143
釋義,
154144
釋義補充,
155145
韻目原貌,
@@ -166,7 +156,7 @@ def main():
166156
)
167157
)
168158
if 字頭覈校說明 == '校':
169-
字頭 = f'[{字頭原貌}/{字頭}]'
159+
含原貌字頭 = f'[{字頭原貌}/{含原貌字頭}]'
170160
if not 釋義:
171161
釋義參照 = '下'
172162
elif 釋義補充:
@@ -177,8 +167,8 @@ def main():
177167
# 修正
178168
字頭說明 = ''
179169
if (patch := patches.get(字序_key)) is not None:
180-
assert patch.原字頭 == 字頭, (
181-
f'patching 小韻 #{原書小韻號}/{小韻字號} 字 "{patch.原字頭}", but the actual 字 is "{字頭}"'
170+
assert patch.原字頭 == 含原貌字頭, (
171+
f'patching 小韻 #{原書小韻號}/{小韻字號} 字 "{patch.原字頭}", but the actual 字 is "{含原貌字頭}"'
182172
)
183173
patch_coverage.add(字序_key)
184174
assert patch.校正字頭, (
@@ -189,10 +179,10 @@ def main():
189179
f'invalid 校正字頭: "{patch.校正字頭}"'
190180
)
191181
if '~' in patch.校正字頭:
192-
assert not 字頭.startswith('['), (
193-
f'cannot use "~" in 校正字頭 when 字頭 contains correction: "{字頭}"'
182+
assert not 含原貌字頭.startswith('['), (
183+
f'cannot use "~" in 校正字頭 when 字頭 contains correction: "{含原貌字頭}"'
194184
)
195-
字頭 = patch.校正字頭.replace('~', 字頭)
185+
含原貌字頭 = patch.校正字頭.replace('~', 含原貌字頭)
196186

197187
# 字頭說明 is an added field, thus it does not have an original value
198188
字頭說明 = patch.字頭說明
@@ -209,34 +199,43 @@ def main():
209199
)
210200
釋義參照 = patch.校正釋義參照
211201
elif 字序_data[字序_key].sbgy_字.endswith('/-]'):
212-
assert not 字頭.startswith('[')
213-
字頭 = f'[{字頭}/-]'
202+
assert not 含原貌字頭.startswith('[')
203+
含原貌字頭 = f'[{含原貌字頭}/-]'
214204

215205
字_check = 字序_data[字序_key].
216-
assert 字頭 == 字_check, (
217-
f'字頭 mismatch between 字序表 and (patched) 廣韻 data: "{字_check}" != "{字頭}" (小韻 {原書小韻號}/{小韻字號})'
206+
assert 含原貌字頭 == 字_check, (
207+
f'字頭 mismatch between 字序表 and (patched) 廣韻 data: "{字_check}" != "{含原貌字頭}" (小韻 {原書小韻號}/{小韻字號})'
218208
)
219-
if 字頭.startswith('['):
220-
字頭原貌, 字頭 = 字頭[1:-1].split('/')
209+
if 含原貌字頭.startswith('['):
210+
字頭原貌, 字頭 = 含原貌字頭[1:-1].split('/')
221211
字頭 = '' if 字頭 == '-' else 字頭
222212
字頭原貌 = '' if 字頭原貌 == '-' else 字頭原貌
223213
else:
214+
字頭 = 含原貌字頭
224215
字頭原貌 = ''
225216

226217
# 小韻號
227-
if 原書小韻號 in has_細分:
228-
for 細分 in has_細分[原書小韻號]:
218+
# NOTE 字頭 & 細分轄字 in 小韻表.tsv does not contain 字頭原貌 (yet)
219+
字頭或原貌 = 字頭 or 字頭原貌
220+
if 原書小韻號 in 細分號_by_原書小韻:
221+
for 細分 in 細分號_by_原書小韻[原書小韻號]:
229222
小韻號 = 原書小韻號 + 細分
230-
if 字頭 in 小韻細分_data[小韻號][2]:
231-
小韻細分_coverage.setdefault(小韻號, set()).add(字頭)
223+
if 字頭或原貌 in 細分轄字_by_小韻[小韻號]:
224+
小韻細分_coverage.setdefault(小韻號, set()).add(字頭或原貌)
232225
break
233226
else:
234227
raise ValueError(
235-
f'cannot determine 小韻細分 for {字頭} (小韻 #{原書小韻號})'
228+
f'cannot determine 小韻細分 for {字頭或原貌} (小韻 #{原書小韻號})'
236229
)
237230
else:
238231
小韻號 = 原書小韻號
239232

233+
if 小韻號 not in 小韻號_seen:
234+
assert 字頭或原貌 == 小韻_data[小韻號].首字, (
235+
f'首字 mismatch for 小韻 #{小韻號}: {字頭或原貌} != {小韻_data[小韻號].首字}'
236+
)
237+
小韻號_seen.add(小韻號)
238+
240239
# 音韻地位
241240
音韻地位 = 小韻_data[小韻號].音韻地位
242241

@@ -268,7 +267,7 @@ def main():
268267
)
269268

270269
for 小韻號, cov in 小韻細分_coverage.items():
271-
specified = set(小韻細分_data[小韻號][2])
270+
specified = set(細分轄字_by_小韻[小韻號])
272271
diff = specified - cov
273272
assert not diff, (
274273
f'字頭 listed in 小韻細分_data but not seen: {"".join(sorted(diff))} (小韻 #{小韻號})'

src/split.csv

Lines changed: 0 additions & 20 deletions
This file was deleted.

0 commit comments

Comments
 (0)