11import sys
22import os
33import argparse
4+ import yaml
45from collections import defaultdict
56
def print_error(message):
    """Write *message* to standard error, terminated by a newline."""
    sys.stderr.write(str(message) + "\n")
89
10+ # ---------- Argument Parsing ----------
911def parse_args ():
1012 parser = argparse .ArgumentParser (
1113 description = "Normalizes the frequencies in a dictionary with transcriptions."
@@ -14,13 +16,44 @@ def parse_args():
1416 "word_list" ,
1517 help = "Path to the word list file (e.g., WORD-LIST.txt)"
1618 )
19+ parser .add_argument (
20+ "layout_yaml" ,
21+ help = "Path to the YAML file containing the layout definitions (UTF-8 encoded)"
22+ )
1723 return parser .parse_args ()
1824
25+ # ---------- File Validation ----------
def validate_file(file_path):
    """Abort the program unless *file_path* names an existing regular file.

    Args:
        file_path: Path to check.

    Raises:
        SystemExit: With exit status 2 when the path does not exist or is
            not a regular file (for example, a directory).
    """
    if os.path.isfile(file_path):
        return
    # The path is echoed exactly as given, with no padding inside the quotes,
    # so the user can see precisely which name was looked up.
    print_error(f'Failure! Could not find file "{file_path}".')
    sys.exit(2)
2330
31+ # ---------- YAML Layout Loading ----------
def load_layout(yaml_path):
    """Load the layout definition and map every symbol to its group index.

    The YAML document must be a mapping whose ``layout`` key holds a list of
    groups, and every group must be a non-empty list of non-empty strings.
    A symbol may appear in only one group.

    Args:
        yaml_path: Path to the UTF-8 encoded YAML layout file.

    Returns:
        A dict mapping each symbol to the zero-based position of the group
        it belongs to.

    Raises:
        SystemExit: With exit status 4 when the document does not have the
            required structure or a symbol is listed more than once.
    """
    with open(yaml_path, encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load returns None for an empty document and may return a scalar or
    # a list for other inputs, so confirm it is a mapping before indexing it.
    if not isinstance(data, dict) or not isinstance(data.get("layout"), list):
        print_error("Error: YAML file must contain a 'layout' key with a list of lists.")
        sys.exit(4)

    layout_dict = {}

    for index, group in enumerate(data["layout"]):
        if not isinstance(group, list) or not group:
            print_error(f"Error: Layout entry {index} must be a non-empty list of strings.")
            sys.exit(4)
        for symbol in group:
            # YAML turns unquoted scalars such as 1, no or null into
            # non-strings, and an empty symbol would match everywhere without
            # consuming input; reject both here with a clear message.
            if not isinstance(symbol, str) or not symbol:
                print_error(
                    f"Error: Layout entry {index} contains {symbol!r}, which is not "
                    "a non-empty string (quote it in the YAML if it is meant as a symbol)."
                )
                sys.exit(4)
            if symbol in layout_dict:
                print_error(f"Error: Duplicate symbol '{symbol}' found in layout. Aborting.")
                sys.exit(4)
            layout_dict[symbol] = index

    return layout_dict
55+
56+ # ---------- Word List Loading ----------
2457def load_entries (file_path ):
2558 with open (file_path , encoding = 'utf-8' ) as f :
2659 lines = [line .strip () for line in f if line .strip ()]
@@ -32,7 +65,7 @@ def load_entries(file_path):
3265 print_error (f"Malformed line { line_num } : '{ line } ' (expected at least 2 tab-separated fields)" )
3366 sys .exit (3 )
3467
35- chinese , latin = parts [:2 ]
68+ native , latin = parts [:2 ]
3669 number = None
3770 if len (parts ) > 2 :
3871 try :
@@ -41,16 +74,40 @@ def load_entries(file_path):
4174 print_error (f"Malformed line { line_num } : '{ line } ' (third field must be an integer if present)" )
4275 sys .exit (3 )
4376
44- entries .append ({'chinese ' : chinese , 'latin' : latin , 'number' : number })
77+ entries .append ({'native ' : native , 'latin' : latin , 'number' : number })
4578
4679 return entries
4780
48- def group_entries (entries ):
81+ # ---------- Grouping by Layout Index Pattern ----------
def group_entries(entries, layout_dict):
    """Group entries whose Latin strings map to the same layout index sequence.

    Each Latin string is split greedily into layout symbols (longest symbol
    first) and every symbol is replaced by its group index.  Entries that
    produce the same index sequence end up in the same group.

    Args:
        entries: Iterable of entry dicts; each must have a ``'latin'`` key.
        layout_dict: Mapping from symbol to group index.

    Returns:
        A ``defaultdict(list)`` keyed by the comma-separated index sequence,
        with the matching entries as values in input order.

    Raises:
        SystemExit: With exit status 5 when part of a Latin string cannot be
            matched by any layout symbol.
    """
    groups = defaultdict(list)

    # Longest symbols first so a multi-letter symbol wins over its own prefix
    # (e.g. 'Zh' before 'Z').  Empty symbols are dropped: they would match at
    # every position without consuming input, so the scan would never end.
    sorted_symbols = sorted((s for s in layout_dict if s), key=len, reverse=True)

    for entry in entries:
        latin = entry['latin']
        index_seq = []
        i = 0

        while i < len(latin):
            symbol = next(
                (s for s in sorted_symbols if latin.startswith(s, i)),
                None,
            )
            if symbol is None:
                print_error(f"Error: Unknown symbol in Latin string '{latin}' near '{latin[i:]}'")
                sys.exit(5)
            index_seq.append(str(layout_dict[symbol]))
            i += len(symbol)

        # A separator keeps multi-digit indices unambiguous: without it the
        # sequences [1, 12] and [11, 2] would both collapse to "112".
        groups[','.join(index_seq)].append(entry)

    return groups
53109
110+ # ---------- Frequency Normalization ----------
54111def normalize_frequencies (groups ):
55112 sorted_entries = []
56113 for group in groups .values ():
@@ -67,18 +124,23 @@ def normalize_frequencies(groups):
67124
68125 return sorted_entries
69126
127+ # ---------- Output ----------
def print_entries(entries):
    """Print each entry to standard output as one tab-separated line.

    The line holds the native text, the Latin string and, when present, the
    number, separated by single tab characters.

    Args:
        entries: Iterable of entry dicts with the keys ``'native '``,
            ``'latin'`` and ``'number'`` (``None`` when the entry has none).
    """
    for e in entries:
        # NOTE(review): the key 'native ' (with a trailing space) is what
        # load_entries stores, so it is kept here; confirm the space is intended.
        parts = [e['native '], e['latin']]
        if e['number'] is not None:
            # join() only accepts strings, so the number is converted first
            parts.append(str(e['number']))
        # A bare tab keeps the output in the same tab-separated format the
        # word list is read in.
        print('\t'.join(parts))
76134
135+ # ---------- Main ----------
def main():
    """Run the tool: check inputs, load them, group, normalize and print.

    Both input paths are validated before either file is read; the layout is
    loaded before the word list.
    """
    args = parse_args()

    # Validate both paths up front, in the order word list, then layout
    for path in (args.word_list, args.layout_yaml):
        validate_file(path)

    layout_dict = load_layout(args.layout_yaml)
    word_entries = load_entries(args.word_list)

    grouped = group_entries(word_entries, layout_dict)
    print_entries(normalize_frequencies(grouped))
84146
0 commit comments