@@ -9,11 +9,9 @@ And change the actual field types to 32-bits. This decreases the hash
99tables in size.
1010---
1111 source/lexbor/core/shs.h | 4 +-
12- utils/lexbor/encoding/multi-byte.py | 181 +++++++++++++++++++++++++++
1312 utils/lexbor/encoding/single-byte.py | 2 +-
1413 utils/lexbor/lexbor/LXB.py | 12 +-
15- 4 files changed, 193 insertions(+), 6 deletions(-)
16- create mode 100755 utils/lexbor/encoding/multi-byte.py
14+ 3 files changed, 12 insertions(+), 6 deletions(-)
1715
1816diff --git a/source/lexbor/core/shs.h b/source/lexbor/core/shs.h
1917index 7a63a07..c84dfaa 100644
@@ -31,193 +29,6 @@ index 7a63a07..c84dfaa 100644
3129 }
3230 lexbor_shs_hash_t;
3331
34- diff --git a/utils/lexbor/encoding/multi-byte.py b/utils/lexbor/encoding/multi-byte.py
35- new file mode 100755
36- index 0000000..f8af2d4
37- --- /dev/null
38- +++ b/utils/lexbor/encoding/multi-byte.py
39- @@ -0,0 +1,181 @@
40- +
41- + import sys, re, os
42- +
43- + # Find and append run script run dir to module search path
44- + ABS_PATH = os.path.dirname(os.path.abspath(__file__))
45- + sys.path.append("{}/../lexbor/".format(ABS_PATH))
46- +
47- + import LXB
48- +
49- + class MultiByte:
50- + var_name_prefix = 'lxb_encoding_multi_index_'
51- + hash_name_prefix = 'lxb_encoding_multi_hash_'
52- + flat_index_typename = 'lxb_encoding_multi_index_t'
53- +
54- + def __init__(self, dir_path, temp_file_h, temp_file_c, save_to, silent = False):
55- + if not os.path.isdir(dir_path):
56- + raise Exception('Directory "{}" not exit'.format(dir_path))
57- +
58- + self.dir_path = dir_path
59- + self.silent = silent
60- + self.temp_file_h = temp_file_h
61- + self.temp_file_c = temp_file_c
62- + self.save_to = save_to
63- +
64- + def make(self):
65- + buf = []
66- + externs = []
67- + hash_externs = []
68- + hash_buf = []
69- + hash_sizes = []
70- + dir_path = self.dir_path
71- +
72- + for f in sorted(os.listdir(dir_path)):
73- + f_path = os.path.join(dir_path, f)
74- + if not os.path.isfile(f_path):
75- + continue
76- +
77- + print('File: {}'.format(f_path))
78- +
79- + idx = 0
80- + values = {'buffer_size': idx, 'max_size': 0}
81- +
82- + fo = open(f_path, "rb")
83- +
84- + for lineno, line in enumerate(fo):
85- + line = line.rstrip()
86- + if not line or line[:1] == b'#':
87- + continue
88- +
89- + entries = line.split(b'\t')
90- + captions = entries[-1].split(b' ', maxsplit=1)
91- +
92- + if not captions[0]:
93- + raise Exception('Failed to get chars variant on line {}'.format(lineno))
94- +
95- + entries.append(captions[0])
96- + entries.append(captions[1] if captions[1] else b'')
97- +
98- + entries[2] = entries[3]
99- + entries[3] = len(entries[2])
100- +
101- + idx = int(entries[0].decode('utf-8'))
102- + values[idx] = entries
103- +
104- + if values['max_size'] < idx:
105- + values['max_size'] = idx
106- +
107- + values['buffer_len'] = len(values)
108- +
109- + flat_name = self.make_name(f)
110- + res = self.make_flat_index(flat_name, values)
111- + hash_index = self.make_hash_index(flat_name, values)
112- +
113- + buf.append(''.join(res))
114- + externs.append('{};'.format(self.make_extern_name(flat_name, self.buffer_size(values))))
115- + hash_buf.append(''.join(hash_index[0]))
116- + hash_sizes.append(hash_index[1])
117- + hash_externs.append(hash_index[2])
118- +
119- + self.save_res(flat_name, ''.join(res), ''.join(hash_index[0]))
120- +
121- + fo.close()
122- +
123- + externs.append('')
124- + externs += hash_externs
125- +
126- + save_to_h = os.path.join(self.save_to, "multi.h")
127- +
128- + lxb_temp = LXB.Temp(self.temp_file_h, save_to_h)
129- + lxb_temp.pattern_append("%%EXTERNS%%", '\n'.join(externs))
130- + lxb_temp.pattern_append("%%SIZES%%", '\n'.join(hash_sizes))
131- + lxb_temp.build()
132- + lxb_temp.save()
133- +
134- + return [buf, externs, hash_buf, hash_sizes]
135- +
136- + def make_name(self, filename):
137- + name = re.sub("[^a-zA-Z0-9]", "_", filename)
138- + name = re.sub("^index_", "", name)
139- + name = re.sub("_txt$", "", name)
140- +
141- + return name
142- +
143- + def make_var_name(self, name):
144- + return '{}{}'.format(self.var_name_prefix, name)
145- +
146- + def make_hash_name(self, name):
147- + return '{}{}'.format(self.hash_name_prefix, name)
148- +
149- + def make_extern_name(self, name, buffer_size):
150- + var_name = self.make_var_name(name) + '[{}]'.format(buffer_size)
151- +
152- + return 'LXB_EXTERN const {} {}'.format(self.flat_index_typename, var_name)
153- +
154- + def buffer_size(self, values):
155- + return values['max_size'] + 1
156- +
157- + def make_flat_index(self, name, values):
158- + buffer_size = self.buffer_size(values)
159- +
160- + print("Flat buffer size:", buffer_size)
161- +
162- + res = LXB.Res(self.flat_index_typename,
163- + self.make_var_name(name) + '[{}]'.format(buffer_size), False, None, 'LXB_API')
164- +
165- + for idx in range(0, buffer_size):
166- + if idx in values:
167- + entries = values[idx]
168- +
169- + assert len(entries[2]) <= 4
170- + res.append('{{{{{}}}, {}, {}}}'.format(toHex(entries[2].decode('utf-8')),
171- + entries[3], entries[1].decode('utf-8')))
172- + res.append('/* {} */'.format(entries[4].decode('utf-8')), is_comment = True)
173- + else:
174- + res.append('{{0}, 0, LXB_ENCODING_ERROR_CODEPOINT}')
175- + res.append('/* Not defined */', is_comment = True)
176- +
177- + buf = res.create()
178- +
179- + return buf
180- +
181- + def make_hash_index(self, name, values):
182- + buffer_size = self.buffer_size(values)
183- + name = self.make_hash_name(name)
184- +
185- + hash_key = LXB.HashKey(buffer_size, name, 'LXB_API')
186- +
187- + for idx in range(0, buffer_size):
188- + if idx in values:
189- + entries = values[idx]
190- +
191- + key_id = entries[1].decode('utf-8')
192- +
193- + hash_key.append(key_id, idx)
194- +
195- + return hash_key.create(rate = 1)
196- +
197- + def save_res(self, filename, buf, hash_buf):
198- + save_to_c = os.path.join(self.save_to, filename + ".c")
199- +
200- + print("Save to:", save_to_c)
201- +
202- + lxb_temp = LXB.Temp(self.temp_file_c, save_to_c)
203- + lxb_temp.pattern_append("%%INDEX%%", buf)
204- + lxb_temp.pattern_append("%%HASH%%", hash_buf)
205- + lxb_temp.build()
206- + lxb_temp.save()
207- +
208- + def toHex(s):
209- + lst = []
210- +
211- + for ch in bytes(s, 'utf-8'):
212- + hv = hex(ch).replace('0x', '\\\\x')
213- + lst.append("'{}'".format(hv))
214- +
215- + return ', '.join(lst)
216- +
217- + if __name__ == "__main__":
218- + sb = MultiByte("multi-byte", "tmp/multi.h", "tmp/multi.c",
219- + "../../../source/lexbor/encoding")
220- + sb.make()
22132diff --git a/utils/lexbor/encoding/single-byte.py b/utils/lexbor/encoding/single-byte.py
22233index 9a85d54..ec2023c 100755
22334--- a/utils/lexbor/encoding/single-byte.py
0 commit comments