|
7 | 7 | from .table_structure import TableStructure, TableField |
8 | 8 |
|
9 | 9 |
|
# Maps a MySQL character-set name (as it appears in a CREATE TABLE statement)
# to the name of the Python codec used to decode bytes stored in that charset.
#
# A value of None means Python ships no codec for that charset; such a value
# must not be passed to bytes.decode(), which raises TypeError for a None
# encoding.
CHARSET_MYSQL_TO_PYTHON = {
    'armscii8': None,       # ARMSCII-8 is not directly supported in Python
    'ascii': 'ascii',
    'big5': 'big5',
    'binary': 'latin1',     # Treat binary data as Latin-1 in Python
    'cp1250': 'cp1250',
    'cp1251': 'cp1251',
    'cp1256': 'cp1256',
    'cp1257': 'cp1257',
    'cp850': 'cp850',
    'cp852': 'cp852',
    'cp866': 'cp866',
    'cp932': 'cp932',
    'dec8': 'latin1',       # DEC8 is similar to Latin-1
    'eucjpms': 'euc_jp',    # Map to EUC-JP
    'euckr': 'euc_kr',
    'gb18030': 'gb18030',
    'gb2312': 'gb2312',
    'gbk': 'gbk',
    'geostd8': None,        # GEOSTD8 is not directly supported in Python
    'greek': 'iso8859_7',
    'hebrew': 'iso8859_8',
    'hp8': None,            # HP8 is not directly supported in Python
    'keybcs2': None,        # KEYBCS2 is not directly supported in Python
    'koi8r': 'koi8_r',
    'koi8u': 'koi8_u',
    'latin1': 'cp1252',     # MySQL's latin1 corresponds to Windows-1252
    'latin2': 'iso8859_2',
    'latin5': 'iso8859_9',
    'latin7': 'iso8859_13',
    'macce': 'mac_latin2',
    'macroman': 'mac_roman',
    'sjis': 'shift_jis',
    'swe7': None,           # SWE7 is not directly supported in Python
    'tis620': 'tis_620',
    # MySQL's ucs2/utf16/utf32 are big-endian and carry no BOM.  Python's
    # plain 'utf_16'/'utf_32' codecs fall back to the host byte order when no
    # BOM is present, so the explicit big-endian codecs are required.
    'ucs2': 'utf_16_be',
    'ujis': 'euc_jp',
    'utf16': 'utf_16_be',
    'utf16le': 'utf_16_le',
    'utf32': 'utf_32_be',
    # 'utf8' is the legacy alias of utf8mb3 still reported by older servers;
    # all three names decode as UTF-8.
    'utf8': 'utf_8',
    'utf8mb3': 'utf_8',
    'utf8mb4': 'utf_8',
}
| 53 | + |
| 54 | + |
10 | 55 | def convert_bytes(obj): |
11 | 56 | if isinstance(obj, dict): |
12 | 57 | new_obj = {} |
@@ -272,7 +317,7 @@ def convert_record( |
272 | 317 | 'text' in mysql_field_type or 'char' in mysql_field_type |
273 | 318 | ): |
274 | 319 | if isinstance(clickhouse_field_value, bytes): |
275 | | - charset = mysql_structure.charset or 'utf-8' |
| 320 | + charset = mysql_structure.charset_python |
276 | 321 | clickhouse_field_value = clickhouse_field_value.decode(charset) |
277 | 322 |
|
278 | 323 | if 'point' in mysql_field_type: |
@@ -541,8 +586,10 @@ def parse_mysql_table_structure(self, create_statement, required_table_name=None |
541 | 586 | prev_prev_token = prev_token |
542 | 587 | prev_token = curr_token |
543 | 588 |
|
544 | | - if structure.charset.startswith('utf8'): |
545 | | - structure.charset = 'utf-8' |
| 589 | + structure.charset_python = 'utf-8' |
| 590 | + |
| 591 | + if structure.charset: |
| 592 | + structure.charset_python = CHARSET_MYSQL_TO_PYTHON[structure.charset] |
546 | 593 |
|
547 | 594 | for line in inner_tokens: |
548 | 595 | if line.lower().startswith('unique key'): |
|
0 commit comments