Skip to content

Commit c234f10

Browse files
authored
More corner cases for strip comments (#199)
1 parent df5a308 commit c234f10

File tree

2 files changed

+118
-23
lines changed

2 files changed

+118
-23
lines changed

mysql_ch_replicator/converter.py

Lines changed: 98 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1078,46 +1078,121 @@ def _strip_comments(self, create_statement):
10781078
"""
10791079
Strip COMMENT clauses from CREATE TABLE statements.
10801080
Handles MySQL-style quote escaping where quotes are doubled ('' or "").
1081+
1082+
This function properly parses SQL syntax to distinguish between:
1083+
- COMMENT clauses (which should be removed)
1084+
- String literals containing "COMMENT" (which should be preserved)
1085+
- Identifiers containing "comment" (which should be preserved)
10811086
"""
10821087
result = []
10831088
i = 0
1089+
10841090
while i < len(create_statement):
1085-
# Look for COMMENT keyword (case insensitive)
1086-
if (i + 7 < len(create_statement) and
1091+
char = create_statement[i]
1092+
1093+
# Handle string literals (single quotes)
1094+
if char == "'":
1095+
result.append(char)
1096+
i += 1
1097+
# Copy the entire string literal, handling escaped quotes
1098+
while i < len(create_statement):
1099+
char = create_statement[i]
1100+
result.append(char)
1101+
if char == "'":
1102+
# Check if this is an escaped quote (doubled)
1103+
if i + 1 < len(create_statement) and create_statement[i + 1] == "'":
1104+
i += 1 # Skip to the second quote
1105+
result.append(create_statement[i]) # Add the second quote
1106+
else:
1107+
i += 1 # End of string literal
1108+
break
1109+
i += 1
1110+
continue
1111+
1112+
# Handle string literals (double quotes)
1113+
if char == '"':
1114+
result.append(char)
1115+
i += 1
1116+
# Copy the entire string literal, handling escaped quotes
1117+
while i < len(create_statement):
1118+
char = create_statement[i]
1119+
result.append(char)
1120+
if char == '"':
1121+
# Check if this is an escaped quote (doubled)
1122+
if i + 1 < len(create_statement) and create_statement[i + 1] == '"':
1123+
i += 1 # Skip to the second quote
1124+
result.append(create_statement[i]) # Add the second quote
1125+
else:
1126+
i += 1 # End of string literal
1127+
break
1128+
i += 1
1129+
continue
1130+
1131+
# Handle backtick-quoted identifiers
1132+
if char == '`':
1133+
result.append(char)
1134+
i += 1
1135+
# Copy the entire identifier
1136+
while i < len(create_statement):
1137+
char = create_statement[i]
1138+
result.append(char)
1139+
if char == '`':
1140+
i += 1 # End of identifier
1141+
break
1142+
i += 1
1143+
continue
1144+
1145+
# Look for COMMENT keyword (case insensitive) outside of quotes
1146+
if (i + 7 <= len(create_statement) and
10871147
create_statement[i:i+7].upper() == 'COMMENT' and
1088-
(i == 0 or (not create_statement[i-1].isalnum() and create_statement[i-1] != '`')) and
1148+
(i == 0 or not create_statement[i-1].isalnum()) and
10891149
(i + 7 >= len(create_statement) or not create_statement[i+7].isalnum())):
10901150

1151+
# This looks like a COMMENT keyword, but we need to verify it's actually
1152+
# a COMMENT clause and not just an identifier that happens to be "comment"
1153+
10911154
# Skip COMMENT keyword
1092-
i += 7
1155+
j = i + 7
10931156

10941157
# Skip whitespace and optional '='
1095-
while i < len(create_statement) and create_statement[i].isspace():
1096-
i += 1
1097-
if i < len(create_statement) and create_statement[i] == '=':
1098-
i += 1
1099-
while i < len(create_statement) and create_statement[i].isspace():
1100-
i += 1
1158+
while j < len(create_statement) and create_statement[j].isspace():
1159+
j += 1
1160+
if j < len(create_statement) and create_statement[j] == '=':
1161+
j += 1
1162+
while j < len(create_statement) and create_statement[j].isspace():
1163+
j += 1
11011164

1102-
# Find the quoted string
1103-
if i < len(create_statement) and create_statement[i] in ('"', "'"):
1104-
quote_char = create_statement[i]
1105-
i += 1 # Skip opening quote
1165+
# Check if this is followed by a quoted string (indicating a COMMENT clause)
1166+
if j < len(create_statement) and create_statement[j] in ('"', "'"):
1167+
# This is a COMMENT clause - skip it entirely
1168+
quote_char = create_statement[j]
1169+
j += 1 # Skip opening quote
11061170

11071171
# Find the closing quote, handling escaped quotes
1108-
while i < len(create_statement):
1109-
if create_statement[i] == quote_char:
1172+
while j < len(create_statement):
1173+
if create_statement[j] == quote_char:
11101174
# Check if this is an escaped quote (doubled)
1111-
if i + 1 < len(create_statement) and create_statement[i + 1] == quote_char:
1112-
i += 2 # Skip both quotes
1175+
if j + 1 < len(create_statement) and create_statement[j + 1] == quote_char:
1176+
j += 2 # Skip both quotes
11131177
else:
1114-
i += 1 # Skip closing quote
1178+
j += 1 # Skip closing quote
11151179
break
11161180
else:
1117-
i += 1
1118-
else:
1119-
result.append(create_statement[i])
1120-
i += 1
1181+
j += 1
1182+
1183+
# Skip the entire COMMENT clause
1184+
i = j
1185+
continue
1186+
else:
1187+
# This is not a COMMENT clause (no quoted string follows)
1188+
# Treat it as a regular identifier
1189+
result.append(char)
1190+
i += 1
1191+
continue
1192+
1193+
# Regular character - just copy it
1194+
result.append(char)
1195+
i += 1
11211196

11221197
return ''.join(result)
11231198

test_mysql_ch_replicator.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3329,6 +3329,26 @@ def test_charset_configuration():
33293329
PRIMARY KEY (departments,termine)
33303330
)"""
33313331
),
3332+
# COMMENT keyword inside string literals (critical edge case)
3333+
(
3334+
"CREATE TABLE test (id int DEFAULT 'COMMENT test', name varchar(255))",
3335+
"CREATE TABLE test (id int DEFAULT 'COMMENT test', name varchar(255))"
3336+
),
3337+
# Unquoted column name 'comment' (critical edge case)
3338+
(
3339+
"CREATE TABLE test (comment varchar(255), id int)",
3340+
"CREATE TABLE test (comment varchar(255), id int)"
3341+
),
3342+
# COMMENT in DEFAULT with complex content (critical edge case)
3343+
(
3344+
"CREATE TABLE test (status varchar(50) DEFAULT 'COMMENT: active', id int)",
3345+
"CREATE TABLE test (status varchar(50) DEFAULT 'COMMENT: active', id int)"
3346+
),
3347+
# Multiple string literals containing COMMENT (critical edge case)
3348+
(
3349+
"CREATE TABLE test (col1 varchar(50) DEFAULT 'COMMENT 1', col2 varchar(50) DEFAULT 'COMMENT 2')",
3350+
"CREATE TABLE test (col1 varchar(50) DEFAULT 'COMMENT 1', col2 varchar(50) DEFAULT 'COMMENT 2')"
3351+
),
33323352
])
33333353
def test_strip_comments_function(input_sql, expected_output):
33343354
"""

0 commit comments

Comments
 (0)