Skip to content

Commit d7139b8

Browse files
authored
Fixed comments handling (#196)
1 parent 4b0cf84 commit d7139b8

File tree

2 files changed

+189
-6
lines changed

2 files changed

+189
-6
lines changed

mysql_ch_replicator/converter.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1075,8 +1075,51 @@ def convert_drop_table_query(self, mysql_query):
10751075
raise Exception('not implement')
10761076

10771077
def _strip_comments(self, create_statement):
    """
    Strip COMMENT clauses from a CREATE TABLE statement.

    A clause is the COMMENT keyword (any letter case), optional whitespace,
    an optional '=', optional whitespace, and a single- or double-quoted
    string. Inside that string both MySQL escaping styles are understood:
    doubled quotes ('' or "") and backslash escapes (\\' or \\").

    The keyword is only recognised as a standalone word that is directly
    followed by a quoted string, so identifiers such as `comment`,
    comment_id or user_comment are left untouched. Other quoted strings
    and backtick-quoted identifiers are copied verbatim and never searched
    for the keyword.

    :param create_statement: SQL text of the statement.
    :return: the statement with every COMMENT clause removed; whitespace
        surrounding a removed clause is kept as it was.
    """
    keyword = 'COMMENT'
    keyword_len = len(keyword)
    length = len(create_statement)

    def is_identifier_char(ch):
        # Unquoted MySQL identifiers may contain letters, digits, '_' and '$',
        # so none of them can delimit the keyword.
        return ch.isalnum() or ch in '_$'

    def end_of_quoted(start):
        # Return the index just past the quoted token opening at `start`
        # (or the statement length when the token is never closed).
        quote = create_statement[start]
        pos = start + 1
        while pos < length:
            ch = create_statement[pos]
            if ch == '\\' and quote != '`':
                # Backslash escapes the next character in string literals;
                # inside backtick identifiers a backslash is an ordinary char.
                pos += 2
            elif ch == quote:
                if pos + 1 < length and create_statement[pos + 1] == quote:
                    pos += 2  # Doubled quote is an escaped quote
                else:
                    return pos + 1
            else:
                pos += 1
        return length

    result = []
    i = 0
    while i < length:
        ch = create_statement[i]

        # Copy string literals / quoted identifiers untouched so a keyword
        # appearing inside them (e.g. in a DEFAULT value) is not stripped.
        if ch in ('"', "'", '`'):
            end = end_of_quoted(i)
            result.append(create_statement[i:end])
            i = end
            continue

        if (ch in ('c', 'C')
                and create_statement[i:i + keyword_len].upper() == keyword
                and (i == 0 or not is_identifier_char(create_statement[i - 1]))
                and (i + keyword_len >= length
                     or not is_identifier_char(create_statement[i + keyword_len]))):
            # Look ahead past whitespace and an optional '='.
            j = i + keyword_len
            while j < length and create_statement[j].isspace():
                j += 1
            if j < length and create_statement[j] == '=':
                j += 1
                while j < length and create_statement[j].isspace():
                    j += 1

            # Only a keyword followed by a quoted string is a COMMENT clause;
            # otherwise it is an identifier and falls through to be copied.
            if j < length and create_statement[j] in ('"', "'"):
                i = end_of_quoted(j)
                continue

        result.append(ch)
        i += 1

    return ''.join(result)
10801123

10811124
def parse_mysql_table_structure(self, create_statement, required_table_name=None):
10821125
create_statement = self._strip_comments(create_statement)

test_mysql_ch_replicator.py

Lines changed: 144 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -492,13 +492,13 @@ def test_multi_column_erase():
492492

493493
prepare_env(cfg, mysql, ch)
494494

495-
mysql.execute(f'''
495+
mysql.execute(f"""
496496
CREATE TABLE `{TEST_TABLE_NAME}` (
497-
departments int(11) NOT NULL,
498-
termine int(11) NOT NULL,
497+
departments int(11) NOT NULL COMMENT '事件类型,可选值: ''SYSTEM'', ''BUSINESS''',
498+
termine int(11) NOT NULL COMMENT '事件类型,可选值: ''SYSTEM'', ''BUSINESS''',
499499
PRIMARY KEY (departments,termine)
500500
)
501-
''')
501+
""")
502502

503503

504504
mysql.execute(f"INSERT INTO `{TEST_TABLE_NAME}` (departments, termine) VALUES (10, 20);", commit=True)
@@ -3082,3 +3082,143 @@ def test_resume_initial_replication_with_ignore_deletes():
30823082
finally:
30833083
# Clean up temp config file
30843084
os.unlink(config_file)
3085+
3086+
3087+
@pytest.mark.parametrize("input_sql,expected_output", [
    # Basic single quote comment
    (
        "CREATE TABLE test (id int NOT NULL COMMENT 'Simple comment', name varchar(255))",
        "CREATE TABLE test (id int NOT NULL , name varchar(255))"
    ),
    # Basic double quote comment
    (
        "CREATE TABLE test (id int NOT NULL COMMENT \"Simple comment\", name varchar(255))",
        "CREATE TABLE test (id int NOT NULL , name varchar(255))"
    ),
    # Comment with escaped single quotes (the original bug case)
    (
        "CREATE TABLE test (id int NOT NULL COMMENT '事件类型,可选值: ''SYSTEM'', ''BUSINESS''', name varchar(255))",
        "CREATE TABLE test (id int NOT NULL , name varchar(255))"
    ),
    # Comment with escaped double quotes
    (
        "CREATE TABLE test (id int NOT NULL COMMENT \"Value can be: \"\"ACTIVE\"\" or \"\"INACTIVE\"\"\", name varchar(255))",
        "CREATE TABLE test (id int NOT NULL , name varchar(255))"
    ),
    # Multiple comments in same table
    (
        """CREATE TABLE test (
id int NOT NULL COMMENT 'Primary key',
name varchar(255) COMMENT 'User name',
status enum('active','inactive') COMMENT 'Status with ''quotes'''
)""",
        """CREATE TABLE test (
id int NOT NULL ,
name varchar(255) ,
status enum('active','inactive')
)"""
    ),
    # Comment with COMMENT = syntax
    (
        "CREATE TABLE test (id int NOT NULL COMMENT = 'Primary key', name varchar(255))",
        "CREATE TABLE test (id int NOT NULL , name varchar(255))"
    ),
    # Comment with mixed quotes and special characters
    (
        "CREATE TABLE test (id int COMMENT 'Mixed: ''single'', \"double\", and `backtick`', name text)",
        "CREATE TABLE test (id int , name text)"
    ),
    # Multiline comment
    (
        """CREATE TABLE test (
id int NOT NULL COMMENT 'This is a
multiline comment
with newlines',
name varchar(255)
)""",
        """CREATE TABLE test (
id int NOT NULL ,
name varchar(255)
)"""
    ),
    # Comment with Unicode characters
    (
        "CREATE TABLE test (id int COMMENT '用户ID - 主键', name varchar(255) COMMENT 'Имя пользователя')",
        "CREATE TABLE test (id int , name varchar(255) )"
    ),
    # No comments (should remain unchanged)
    (
        "CREATE TABLE test (id int NOT NULL, name varchar(255))",
        "CREATE TABLE test (id int NOT NULL, name varchar(255))"
    ),
    # Comment at table level
    (
        "CREATE TABLE test (id int NOT NULL, name varchar(255)) COMMENT 'Table comment'",
        "CREATE TABLE test (id int NOT NULL, name varchar(255)) "
    ),
    # Complex case with multiple escaped quotes and special characters
    (
        """CREATE TABLE test (
departments int(11) NOT NULL COMMENT '事件类型,可选值: ''SYSTEM'', ''BUSINESS''',
termine int(11) NOT NULL COMMENT '事件类型,可选值: ''SYSTEM'', ''BUSINESS''',
PRIMARY KEY (departments,termine)
)""",
        """CREATE TABLE test (
departments int(11) NOT NULL ,
termine int(11) NOT NULL ,
PRIMARY KEY (departments,termine)
)"""
    ),
    # Comment with JSON-like content
    (
        "CREATE TABLE test (config json COMMENT '{\"type\": \"config\", \"values\": [\"a\", \"b\"]}', id int)",
        "CREATE TABLE test (config json , id int)"
    ),
    # Comment with SQL injection-like content (should be safely handled)
    (
        "CREATE TABLE test (id int COMMENT 'DROP TABLE users; --', name varchar(255))",
        "CREATE TABLE test (id int , name varchar(255))"
    ),
    # Empty comment
    (
        "CREATE TABLE test (id int COMMENT '', name varchar(255))",
        "CREATE TABLE test (id int , name varchar(255))"
    ),
    # Comment with only spaces
    (
        "CREATE TABLE test (id int COMMENT ' ', name varchar(255))",
        "CREATE TABLE test (id int , name varchar(255))"
    ),
    # Case insensitive COMMENT keyword
    (
        "CREATE TABLE test (id int comment 'lowercase', name varchar(255) Comment 'Mixed case')",
        "CREATE TABLE test (id int , name varchar(255) )"
    ),
])
def test_strip_comments_function(input_sql, expected_output):
    """
    Check _strip_comments against a table of (input SQL, expected SQL) pairs.

    The cases cover:
    - Basic single and double quoted comments
    - Escaped quotes within comments (MySQL style with doubled quotes)
    - Multiple comments in the same table
    - COMMENT = syntax
    - Multiline comments with newlines
    - Unicode characters in comments
    - Table-level comments
    - Complex real-world scenarios
    - Edge cases like empty comments and case variations

    Both texts are whitespace-normalised line by line before comparison, so
    the spaces left behind where a clause was removed (including one left at
    the end of a line) and the indentation of multi-line literals do not
    affect the outcome.
    """
    import re

    from mysql_ch_replicator.converter import MysqlToClickhouseConverter

    converter = MysqlToClickhouseConverter()
    result = converter._strip_comments(input_sql)

    def normalize_whitespace(text):
        # Collapse runs of spaces/tabs and trim every line individually;
        # newlines are preserved so the line structure is still compared.
        lines = (re.sub(r'[ \t]+', ' ', line).strip() for line in text.split('\n'))
        return '\n'.join(lines).strip()

    assert normalize_whitespace(result) == normalize_whitespace(expected_output), f"Failed for input: {input_sql}"

0 commit comments

Comments
 (0)