Skip to content

Commit 6fce777

Browse files
authored
Better enum handling (bakwc#106)
1 parent 050b144 commit 6fce777

File tree

4 files changed

+233
-11
lines changed

4 files changed

+233
-11
lines changed

mysql_ch_replicator/converter.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from pyparsing import Suppress, CaselessKeyword, Word, alphas, alphanums, delimitedList
77

88
from .table_structure import TableStructure, TableField
9+
from .converter_enum_parser import parse_mysql_enum
910

1011

1112
CHARSET_MYSQL_TO_PYTHON = {
@@ -239,8 +240,14 @@ def convert_type(self, mysql_type, parameters):
239240
return 'String'
240241
if 'varchar' in mysql_type:
241242
return 'String'
242-
if 'enum' in mysql_type:
243-
return 'String'
243+
if mysql_type.startswith('enum'):
244+
enum_values = parse_mysql_enum(mysql_type)
245+
ch_enum_values = []
246+
for idx, value_name in enumerate(enum_values):
247+
ch_enum_values.append(f"'{value_name}' = {idx+1}")
248+
ch_enum_values = ', '.join(ch_enum_values)
249+
# Enum8('red' = 1, 'green' = 2, 'black' = 3)
250+
return f'Enum8({ch_enum_values})'
244251
if 'text' in mysql_type:
245252
return 'String'
246253
if 'blob' in mysql_type:
@@ -376,9 +383,13 @@ def convert_record(
376383
]
377384
clickhouse_field_value = ','.join(clickhouse_field_value)
378385

379-
if 'point' in mysql_field_type:
386+
if mysql_field_type.startswith('point'):
380387
clickhouse_field_value = parse_mysql_point(clickhouse_field_value)
381388

389+
if mysql_field_type.startswith('enum(') and isinstance(clickhouse_field_value, int):
390+
enum_values = mysql_structure.fields[idx].additional_data
391+
clickhouse_field_value = enum_values[int(clickhouse_field_value)-1]
392+
382393
clickhouse_record.append(clickhouse_field_value)
383394
return tuple(clickhouse_record)
384395

@@ -745,6 +756,9 @@ def vstrip(e):
745756
vals = [vstrip(v) for v in vals]
746757
additional_data = vals
747758

759+
if field_type.lower().startswith('enum('):
760+
additional_data = parse_mysql_enum(field_type)
761+
748762
structure.fields.append(TableField(
749763
name=field_name,
750764
field_type=field_type,
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
2+
3+
def parse_mysql_enum(enum_definition):
    """
    Parse a MySQL ENUM type definition into the list of its values.

    The keyword is matched case-insensitively and surrounding whitespace is
    ignored. Values may be quoted with single quotes, double quotes or
    backticks, for example:
        enum('point','qwe','def')
        ENUM("asd", 'qwe', "def")
        enum(`point`,`qwe`,`def`)
    all of which yield a list of three strings such as:
        ['point', 'qwe', 'def']

    Note:
      - For single- and double-quoted values, backslash escapes are handled.
      - For backtick-quoted values, only doubling (``) is recognized as escaping.
      - Anything following the closing parenthesis is ignored.

    Args:
        enum_definition: The ENUM type definition string.

    Returns:
        The enum values, in declaration order.

    Raises:
        ValueError: If the string does not start with ``enum``, if anything
            other than whitespace sits between ``enum`` and ``(``, if the
            parentheses are unbalanced, or if the value list is malformed.
    """
    s = enum_definition.strip()

    # The keyword check is case-insensitive.
    if s[:4].lower() != "enum":
        raise ValueError("String does not start with 'enum'")

    pos = s.find('(')
    if pos == -1:
        raise ValueError("Missing '(' in the enum definition")

    # Only whitespace may separate the keyword from its value list; without
    # this check something like "enum_col varchar(10)" would be treated as an
    # ENUM and its unrelated parenthesized part parsed as the value list.
    if s[4:pos].strip():
        raise ValueError("Unexpected text between 'enum' and '(' in the enum definition")

    # Quote- and escape-aware extraction of the text between the outer
    # parentheses; the index just past the closing ')' is not needed here.
    inner_content, _ = _extract_parenthesized_content(s, pos)

    # Split the inner text into the individual string literals.
    return _parse_enum_values(inner_content)
37+
38+
39+
def _extract_parenthesized_content(s, start_index):
40+
"""
41+
Given a string s and the index of a '(' in it,
42+
return a tuple (content, pos) where content is the substring
43+
inside the outer matching parentheses and pos is the index
44+
immediately after the matching closing ')'.
45+
46+
This function takes special care to ignore any parentheses
47+
that occur inside quotes (a quoted literal is any part enclosed by
48+
', " or `) and also to skip over escape sequences in single/double quotes.
49+
(Backticks do not process backslash escapes.)
50+
"""
51+
if s[start_index] != '(':
52+
raise ValueError("Expected '(' at position {}".format(start_index))
53+
depth = 1
54+
i = start_index + 1
55+
content_start = i
56+
in_quote = None # will be set to a quoting character when inside a quoted literal
57+
58+
# Allow these quote characters.
59+
allowed_quotes = ("'", '"', '`')
60+
61+
while i < len(s):
62+
c = s[i]
63+
if in_quote:
64+
# Inside a quoted literal.
65+
if in_quote in ("'", '"'):
66+
if c == '\\':
67+
# Skip the escape character and the next character.
68+
i += 2
69+
continue
70+
# Whether we are in a backtick or one of the other quotes,
71+
# check for the closing quote.
72+
if c == in_quote:
73+
# Check for a doubled quote.
74+
if i + 1 < len(s) and s[i + 1] == in_quote:
75+
i += 2
76+
continue
77+
else:
78+
in_quote = None
79+
i += 1
80+
continue
81+
else:
82+
i += 1
83+
continue
84+
else:
85+
# Not inside a quoted literal.
86+
if c in allowed_quotes:
87+
in_quote = c
88+
i += 1
89+
continue
90+
elif c == '(':
91+
depth += 1
92+
i += 1
93+
continue
94+
elif c == ')':
95+
depth -= 1
96+
i += 1
97+
if depth == 0:
98+
# Return the substring inside (excluding the outer parentheses)
99+
return s[content_start:i - 1], i
100+
continue
101+
else:
102+
i += 1
103+
104+
raise ValueError("Unbalanced parentheses in enum definition")
105+
106+
107+
def _parse_enum_values(content):
108+
"""
109+
Given the inner text from an ENUM declaration—for example:
110+
"'point', 'qwe', 'def'"
111+
parse and return a list of the string values as MySQL would see them.
112+
113+
This function handles:
114+
- For single- and double–quoted strings: backslash escapes and doubled quotes.
115+
- For backtick–quoted identifiers: only doubled backticks are recognized.
116+
"""
117+
values = []
118+
i = 0
119+
allowed_quotes = ("'", '"', '`')
120+
while i < len(content):
121+
# Skip any whitespace.
122+
while i < len(content) and content[i].isspace():
123+
i += 1
124+
if i >= len(content):
125+
break
126+
# The next non–whitespace character must be one of the allowed quotes.
127+
if content[i] not in allowed_quotes:
128+
raise ValueError("Expected starting quote for enum value at position {} in {!r}".format(i, content))
129+
quote = content[i]
130+
i += 1 # skip the opening quote
131+
132+
literal_chars = []
133+
while i < len(content):
134+
c = content[i]
135+
# For single- and double–quotes, process backslash escapes.
136+
if quote in ("'", '"') and c == '\\':
137+
if i + 1 < len(content):
138+
next_char = content[i + 1]
139+
# Mapping for common escapes. (For the quote character, map it to itself.)
140+
escapes = {
141+
'0': '\0',
142+
'b': '\b',
143+
'n': '\n',
144+
'r': '\r',
145+
't': '\t',
146+
'Z': '\x1a',
147+
'\\': '\\',
148+
quote: quote
149+
}
150+
literal_chars.append(escapes.get(next_char, next_char))
151+
i += 2
152+
continue
153+
else:
154+
# Trailing backslash – treat it as literal.
155+
literal_chars.append('\\')
156+
i += 1
157+
continue
158+
elif c == quote:
159+
# Check for a doubled quote (works for all three quoting styles).
160+
if i + 1 < len(content) and content[i + 1] == quote:
161+
literal_chars.append(quote)
162+
i += 2
163+
continue
164+
else:
165+
i += 1 # skip the closing quote
166+
break # end of this literal
167+
else:
168+
# For backticks, we do not treat backslashes specially.
169+
literal_chars.append(c)
170+
i += 1
171+
# Finished reading one literal; join the characters.
172+
value = ''.join(literal_chars)
173+
values.append(value)
174+
175+
# Skip whitespace after the literal.
176+
while i < len(content) and content[i].isspace():
177+
i += 1
178+
# If there’s a comma, skip it; otherwise, we must be at the end.
179+
if i < len(content):
180+
if content[i] == ',':
181+
i += 1
182+
else:
183+
raise ValueError("Expected comma between enum values at position {} in {!r}"
184+
.format(i, content))
185+
return values
186+
187+
188+
# --- Manual smoke test: run this module directly to print sample parses ---
if __name__ == '__main__':
    sample_definitions = [
        # Plain single-quoted values, with and without spaces after commas.
        "enum('point','qwe','def')",
        "ENUM('asd', 'qwe', 'def')",
        # Mixed quote styles and a doubled double-quote inside a value.
        'enum("first", \'second\', "Don""t stop")',
        # Backslash escapes inside single-quoted values.
        "enum('a\\'b','c\\\\d','Hello\\nWorld')",
        # Backtick-quoted values, alone, mixed, and with a doubled backtick.
        "enum(`point`,`qwe`,`def`)",
        "enum('point',`qwe`,'def')",
        "enum(`first`, `Don``t`, `third`)",
    ]

    for definition in sample_definitions:
        # Report a failure for one sample and carry on with the next.
        try:
            report = "Input: {}\nParsed: {}\n".format(definition, parse_mysql_enum(definition))
        except Exception as exc:
            report = "Error parsing {}: {}\n".format(definition, exc)
        print(report)

mysql_ch_replicator/pymysqlreplication/row_event.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -258,10 +258,9 @@ def __read_values_name(
258258
elif column.type == FIELD_TYPE.YEAR:
259259
return self.packet.read_uint8() + 1900
260260
elif column.type == FIELD_TYPE.ENUM:
261-
if column.enum_values:
262-
return column.enum_values[self.packet.read_uint_by_size(column.size)]
263-
self.packet.read_uint_by_size(column.size)
264-
return None
261+
# if column.enum_values:
262+
# return column.enum_values[self.packet.read_uint_by_size(column.size)]
263+
return self.packet.read_uint_by_size(column.size)
265264
elif column.type == FIELD_TYPE.SET:
266265
bit_mask = self.packet.read_uint_by_size(column.size)
267266
if column.set_values:

test_mysql_ch_replicator.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -981,13 +981,14 @@ def test_different_types_2():
981981
test4 set('1','2','3','4','5','6','7'),
982982
test5 timestamp(0),
983983
test6 char(36),
984+
test7 ENUM('point', 'qwe', 'def'),
984985
PRIMARY KEY (id)
985986
);
986987
''')
987988

988989
mysql.execute(
989-
f"INSERT INTO `{TEST_TABLE_NAME}` (test1, test2, test3, test4, test5, test6) VALUES "
990-
f"(0, POINT(10.0, 20.0), 'azaza', '1,3,5', '2023-08-15 14:30:00', '550e8400-e29b-41d4-a716-446655440000');",
990+
f"INSERT INTO `{TEST_TABLE_NAME}` (test1, test2, test3, test4, test5, test6, test7) VALUES "
991+
f"(0, POINT(10.0, 20.0), 'azaza', '1,3,5', '2023-08-15 14:30:00', '550e8400-e29b-41d4-a716-446655440000', 'def');",
991992
commit=True,
992993
)
993994

@@ -1004,16 +1005,18 @@ def test_different_types_2():
10041005
assert_wait(lambda: len(ch.select(TEST_TABLE_NAME)) == 1)
10051006

10061007
mysql.execute(
1007-
f"INSERT INTO `{TEST_TABLE_NAME}` (test1, test2, test4, test5, test6) VALUES "
1008-
f"(1, POINT(15.0, 14.0), '2,4,5', '2023-08-15 14:40:00', '110e6103-e39b-51d4-a716-826755413099');",
1008+
f"INSERT INTO `{TEST_TABLE_NAME}` (test1, test2, test4, test5, test6, test7) VALUES "
1009+
f"(1, POINT(15.0, 14.0), '2,4,5', '2023-08-15 14:40:00', '110e6103-e39b-51d4-a716-826755413099', 'point');",
10091010
commit=True,
10101011
)
10111012

10121013
assert_wait(lambda: len(ch.select(TEST_TABLE_NAME)) == 2)
10131014
assert_wait(lambda: len(ch.select(TEST_TABLE_NAME, 'test1=True')) == 1)
10141015

10151016
assert ch.select(TEST_TABLE_NAME, 'test1=True')[0]['test2']['x'] == 15.0
1017+
assert ch.select(TEST_TABLE_NAME, 'test1=True')[0]['test7'] == 'point'
10161018
assert ch.select(TEST_TABLE_NAME, 'test1=False')[0]['test2']['y'] == 20.0
1019+
assert ch.select(TEST_TABLE_NAME, 'test1=False')[0]['test7'] == 'def'
10171020
assert ch.select(TEST_TABLE_NAME, 'test1=False')[0]['test3'] == 'azaza\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
10181021

10191022
assert ch.select(TEST_TABLE_NAME, 'test1=True')[0]['test4'] == '2,4,5'

0 commit comments

Comments
 (0)