Skip to content

Commit fd604aa

Browse files
committed
fixed normalize-transcribed sorting textonyms incorrectly
1 parent 845d51d commit fd604aa

File tree

1 file changed

+70
-8
lines changed

1 file changed

+70
-8
lines changed

scripts/normalize-transcribed.py

Lines changed: 70 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import sys
22
import os
33
import argparse
4+
import yaml
45
from collections import defaultdict
56

67
def print_error(message):
    """Write *message* to standard error, followed by a newline.

    Args:
        message: The object to report; ``print`` converts it with ``str``.
    """
    print(message, file=sys.stderr)
89

10+
# ---------- Argument Parsing ----------
911
def parse_args():
1012
parser = argparse.ArgumentParser(
1113
description="Normalizes the frequencies in a dictionary with transcriptions."
@@ -14,13 +16,44 @@ def parse_args():
1416
"word_list",
1517
help="Path to the word list file (e.g., WORD-LIST.txt)"
1618
)
19+
parser.add_argument(
20+
"layout_yaml",
21+
help="Path to the YAML file containing the layout definitions (UTF-8 encoded)"
22+
)
1723
return parser.parse_args()
1824

25+
# ---------- File Validation ----------
1926
def validate_file(file_path):
    """Exit the program unless *file_path* names an existing regular file.

    The message is deliberately generic because this check is applied to
    both the word list and the layout file.

    Args:
        file_path: Path to test with ``os.path.isfile``.

    Raises:
        SystemExit: With status 2, after reporting on stderr, when the path
            does not exist or is not a regular file.
    """
    if not os.path.isfile(file_path):
        print_error(f'Failure! Could not find file "{file_path}".')
        sys.exit(2)
2330

31+
# ---------- YAML Layout Loading ----------
32+
def load_layout(yaml_path):
    """Load the layout file and map every symbol to the index of its group.

    The YAML document must be a mapping whose ``layout`` key holds a list of
    groups; each group is a non-empty list of non-empty strings, and no
    symbol may appear more than once anywhere in the layout.

    Args:
        yaml_path: Path to the UTF-8 encoded YAML layout file.

    Returns:
        A dict mapping each symbol to the zero-based position of the group
        that contains it.

    Raises:
        SystemExit: With status 4, after reporting on stderr, when the file
            is not valid YAML or does not have the structure described above.
    """
    with open(yaml_path, encoding='utf-8') as f:
        try:
            data = yaml.safe_load(f)
        except yaml.YAMLError as err:
            print_error(f"Error: Could not parse YAML file: {err}")
            sys.exit(4)

    # safe_load returns None for an empty document and may return a scalar
    # or a list, so the top-level type has to be checked before the lookup.
    if not isinstance(data, dict) or not isinstance(data.get("layout"), list):
        print_error("Error: YAML file must contain a 'layout' key with a list of lists.")
        sys.exit(4)

    layout_dict = {}

    for index, group in enumerate(data["layout"]):
        if not isinstance(group, list) or not group:
            print_error(f"Error: Layout entry {index} must be a non-empty list of strings.")
            sys.exit(4)
        for symbol in group:
            # Unquoted YAML scalars such as digits load as non-strings, and an
            # empty symbol would match at every position without consuming
            # input; both are rejected here instead of failing later.
            if not isinstance(symbol, str) or not symbol:
                print_error(
                    f"Error: Layout entry {index} contains invalid symbol {symbol!r}; "
                    "symbols must be non-empty strings."
                )
                sys.exit(4)
            if symbol in layout_dict:
                print_error(f"Error: Duplicate symbol '{symbol}' found in layout. Aborting.")
                sys.exit(4)
            layout_dict[symbol] = index

    return layout_dict
55+
56+
# ---------- Word List Loading ----------
2457
def load_entries(file_path):
2558
with open(file_path, encoding='utf-8') as f:
2659
lines = [line.strip() for line in f if line.strip()]
@@ -32,7 +65,7 @@ def load_entries(file_path):
3265
print_error(f"Malformed line {line_num}: '{line}' (expected at least 2 tab-separated fields)")
3366
sys.exit(3)
3467

35-
chinese, latin = parts[:2]
68+
native, latin = parts[:2]
3669
number = None
3770
if len(parts) > 2:
3871
try:
@@ -41,16 +74,40 @@ def load_entries(file_path):
4174
print_error(f"Malformed line {line_num}: '{line}' (third field must be an integer if present)")
4275
sys.exit(3)
4376

44-
entries.append({'chinese': chinese, 'latin': latin, 'number': number})
77+
entries.append({'native': native, 'latin': latin, 'number': number})
4578

4679
return entries
4780

48-
def group_entries(entries):
81+
# ---------- Grouping by Layout Index Pattern ----------
82+
def group_entries(entries, layout_dict):
    """Group entries whose Latin strings map to the same layout index sequence.

    Each entry's ``'latin'`` string is split into layout symbols by greedy
    longest-match, every symbol is replaced by its group index, and entries
    that produce the same index sequence (textonyms) land in the same group.

    Args:
        entries: Iterable of dicts that each have a ``'latin'`` string.
        layout_dict: Mapping of symbol -> group index.

    Returns:
        A ``defaultdict(list)`` keyed by the comma-separated index sequence,
        with the matching entries in input order as values.

    Raises:
        SystemExit: With status 5, after reporting on stderr, when a Latin
            string contains text that matches no layout symbol.
    """
    groups = defaultdict(list)

    # Longest symbols first so multi-letter symbols win over their prefixes
    # (e.g., 'Zh' before 'Z'). Empty symbols are skipped: they would match
    # everywhere without advancing and hang the loop below.
    sorted_symbols = sorted((s for s in layout_dict if s), key=len, reverse=True)

    for entry in entries:
        latin = entry['latin']
        i = 0
        index_seq = []

        while i < len(latin):
            for symbol in sorted_symbols:
                if latin.startswith(symbol, i):
                    index_seq.append(str(layout_dict[symbol]))
                    i += len(symbol)
                    break
            else:
                # No symbol matched at position i.
                print_error(f"Error: Unknown symbol in Latin string '{latin}' near '{latin[i:]}'")
                sys.exit(5)

        # A separator keeps multi-digit indices unambiguous: without it the
        # sequences [1, 0] and [10] would both collapse to the key "10".
        key = ','.join(index_seq)
        groups[key].append(entry)

    return groups
53109

110+
# ---------- Frequency Normalization ----------
54111
def normalize_frequencies(groups):
55112
sorted_entries = []
56113
for group in groups.values():
@@ -67,18 +124,23 @@ def normalize_frequencies(groups):
67124

68125
return sorted_entries
69126

127+
# ---------- Output ----------
70128
def print_entries(entries):
    """Print each entry to stdout as one tab-separated line.

    The line holds the ``'native'`` and ``'latin'`` fields, followed by the
    ``'number'`` field when it is not ``None``.

    Args:
        entries: Iterable of dicts with ``'native'``, ``'latin'`` and
            ``'number'`` keys.
    """
    for e in entries:
        parts = [e['native'], e['latin']]
        if e['number'] is not None:
            # str.join accepts only strings, so the number is converted first.
            parts.append(str(e['number']))
        print('\t'.join(parts))
76134

135+
# ---------- Main ----------
77136
def main():
    """Run the script: validate inputs, group entries, normalize, print.

    Both input paths are checked before either file is read, so a missing
    file is reported before any parsing starts.
    """
    args = parse_args()
    validate_file(args.word_list)
    validate_file(args.layout_yaml)

    layout_dict = load_layout(args.layout_yaml)
    entries = load_entries(args.word_list)
    # Entries are grouped by layout index sequence, not by raw Latin string.
    groups = group_entries(entries, layout_dict)
    sorted_entries = normalize_frequencies(groups)
    print_entries(sorted_entries)
84146

0 commit comments

Comments
 (0)