Skip to content

Commit a71c1d9

Browse files
authored
Support for more charsets (#46)
1 parent 0034933 commit a71c1d9

File tree

2 files changed

+51
-3
lines changed

2 files changed

+51
-3
lines changed

mysql_ch_replicator/converter.py

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,51 @@
77
from .table_structure import TableStructure, TableField
88

99

10+
# Maps a MySQL character-set name to the name of the Python codec used to
# decode column bytes stored in that character set.
#
# A value of None marks a MySQL charset for which no Python codec is mapped
# here; callers must check for None before passing the value to
# bytes.decode(), which does not accept None as an encoding.
CHARSET_MYSQL_TO_PYTHON = {
    'armscii8': None,  # ARMSCII-8 is not directly supported in Python
    'ascii': 'ascii',
    'big5': 'big5',
    'binary': 'latin1',  # Treat binary data as Latin-1 in Python
    'cp1250': 'cp1250',
    'cp1251': 'cp1251',
    'cp1256': 'cp1256',
    'cp1257': 'cp1257',
    'cp850': 'cp850',
    'cp852': 'cp852',
    'cp866': 'cp866',
    'cp932': 'cp932',
    'dec8': 'latin1',  # DEC8 is similar to Latin-1
    'eucjpms': 'euc_jp',  # Map to EUC-JP
    'euckr': 'euc_kr',
    'gb18030': 'gb18030',
    'gb2312': 'gb2312',
    'gbk': 'gbk',
    'geostd8': None,  # GEOSTD8 is not directly supported in Python
    'greek': 'iso8859_7',
    'hebrew': 'iso8859_8',
    'hp8': None,  # HP8 is not directly supported in Python
    'keybcs2': None,  # KEYBCS2 is not directly supported in Python
    'koi8r': 'koi8_r',
    'koi8u': 'koi8_u',
    'latin1': 'cp1252',  # MySQL's latin1 corresponds to Windows-1252
    'latin2': 'iso8859_2',
    'latin5': 'iso8859_9',
    'latin7': 'iso8859_13',
    'macce': 'mac_latin2',
    'macroman': 'mac_roman',
    'sjis': 'shift_jis',
    'swe7': None,  # SWE7 is not directly supported in Python
    'tis620': 'tis_620',
    # MySQL stores ucs2, utf16 and utf32 big-endian and without a BOM.
    # Python's plain 'utf_16' / 'utf_32' codecs fall back to the host's
    # native byte order when no BOM is present, which yields wrong
    # characters on little-endian machines, so the byte order is pinned.
    'ucs2': 'utf_16_be',
    'ujis': 'euc_jp',
    'utf16': 'utf_16_be',
    'utf16le': 'utf_16_le',
    'utf32': 'utf_32_be',
    # 'utf8' is MySQL's alias for utf8mb3; it is listed explicitly so that
    # a direct lookup with that name does not raise KeyError.
    'utf8': 'utf_8',
    'utf8mb3': 'utf_8',  # Both utf8mb3 and utf8mb4 can be mapped to UTF-8
    'utf8mb4': 'utf_8',
}
53+
54+
1055
def convert_bytes(obj):
1156
if isinstance(obj, dict):
1257
new_obj = {}
@@ -272,7 +317,7 @@ def convert_record(
272317
'text' in mysql_field_type or 'char' in mysql_field_type
273318
):
274319
if isinstance(clickhouse_field_value, bytes):
275-
charset = mysql_structure.charset or 'utf-8'
320+
charset = mysql_structure.charset_python
276321
clickhouse_field_value = clickhouse_field_value.decode(charset)
277322

278323
if 'point' in mysql_field_type:
@@ -541,8 +586,10 @@ def parse_mysql_table_structure(self, create_statement, required_table_name=None
541586
prev_prev_token = prev_token
542587
prev_token = curr_token
543588

544-
if structure.charset.startswith('utf8'):
545-
structure.charset = 'utf-8'
589+
structure.charset_python = 'utf-8'
590+
591+
if structure.charset:
592+
structure.charset_python = CHARSET_MYSQL_TO_PYTHON[structure.charset]
546593

547594
for line in inner_tokens:
548595
if line.lower().startswith('unique key'):

mysql_ch_replicator/table_structure.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ class TableStructure:
1313
primary_key_ids: int = 0
1414
table_name: str = ''
1515
charset: str = ''
16+
charset_python: str = ''
1617

1718
def preprocess(self):
1819
field_names = [f.name for f in self.fields]

0 commit comments

Comments
 (0)