Skip to content

Commit 9ed5730

Browse files
authored
Merge pull request #3078 from dolthub/elianddb/8893-charset-validation-fix
dolthub/dolt#8893 - Fix charset validation to match MySQL behavior for issue #8893
2 parents 836cad8 + 64100d3 commit 9ed5730

File tree

7 files changed

+497
-38
lines changed

7 files changed

+497
-38
lines changed

enginetest/enginetests.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4523,14 +4523,14 @@ func TestPreparedInsert(t *testing.T, harness Harness) {
45234523
Bindings: map[string]sqlparser.Expr{
45244524
"v1": mustBuildBindVariable([]byte{0x99, 0x98, 0x97}),
45254525
},
4526-
ExpectedErrStr: "invalid string for charset utf8mb4: '[153 152 151]'",
4526+
ExpectedErrStr: "Incorrect string value: '\\x99\\x98\\x97' for column 'v1' at row 1",
45274527
},
45284528
{
45294529
Query: "INSERT INTO test VALUES (?);",
45304530
Bindings: map[string]sqlparser.Expr{
45314531
"v1": mustBuildBindVariable(string([]byte{0x99, 0x98, 0x97})),
45324532
},
4533-
ExpectedErrStr: "invalid string for charset utf8mb4: '[153 152 151]'",
4533+
ExpectedErrStr: "Incorrect string value: '\\x99\\x98\\x97' for column 'v1' at row 1",
45344534
},
45354535
{
45364536
Query: "INSERT INTO test2 VALUES (?);",

enginetest/queries/script_queries.go

Lines changed: 355 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7140,15 +7140,15 @@ where
71407140
Assertions: []ScriptTestAssertion{
71417141
{
71427142
Query: "insert into t(c) values (X'9876543210');",
7143-
ExpectedErrStr: "invalid string for charset utf8mb4: '[152 118 84 50 16]'",
7143+
ExpectedErrStr: "Incorrect string value: '\\x98vT2\\x10' for column 'c' at row 1",
71447144
},
71457145
{
71467146
Query: "insert into t(v) values (X'9876543210');",
7147-
ExpectedErrStr: "invalid string for charset utf8mb4: '[152 118 84 50 16]'",
7147+
ExpectedErrStr: "Incorrect string value: '\\x98vT2\\x10' for column 'v' at row 1",
71487148
},
71497149
{
71507150
Query: "insert into t(txt) values (X'9876543210');",
7151-
ExpectedErrStr: "invalid string for charset utf8mb4: '[152 118 84 50 16]'",
7151+
ExpectedErrStr: "Incorrect string value: '\\x98vT2\\x10' for column 'txt' at row 1",
71527152
},
71537153
{
71547154
Query: "insert into t(b) values (X'9876543210');",
@@ -7164,6 +7164,358 @@ where
71647164
},
71657165
},
71667166
},
7167+
{
7168+
Name: "charset validation strict vs non-strict mode",
7169+
Dialect: "mysql",
7170+
SetUpScript: []string{
7171+
"create table charset_test (c char(10), v varchar(10), txt text) character set utf8mb4;",
7172+
},
7173+
Assertions: []ScriptTestAssertion{
7174+
{
7175+
Query: "set sql_mode = 'STRICT_TRANS_TABLES';",
7176+
Expected: []sql.Row{{types.OkResult{RowsAffected: 0}}},
7177+
},
7178+
{
7179+
Query: "insert into charset_test(c) values (UNHEX('446F6C744C6162AE'));",
7180+
ExpectedErrStr: "Incorrect string value: '\\xAE' for column 'c' at row 1",
7181+
},
7182+
{
7183+
Query: "insert into charset_test(v) values (UNHEX('446F6C744C6162AE'));",
7184+
ExpectedErrStr: "Incorrect string value: '\\xAE' for column 'v' at row 1",
7185+
},
7186+
{
7187+
Query: "insert into charset_test(txt) values (UNHEX('446F6C744C6162AE'));",
7188+
ExpectedErrStr: "Incorrect string value: '\\xAE' for column 'txt' at row 1",
7189+
},
7190+
{
7191+
Query: "set sql_mode = '';",
7192+
Expected: []sql.Row{{types.OkResult{RowsAffected: 0}}},
7193+
},
7194+
{
7195+
Query: "insert into charset_test(c) values (UNHEX('446F6C744C6162AE'));",
7196+
Expected: []sql.Row{
7197+
{types.OkResult{RowsAffected: 1}},
7198+
},
7199+
},
7200+
{
7201+
Query: "insert into charset_test(v) values (UNHEX('446F6C744C6162AE'));",
7202+
Expected: []sql.Row{
7203+
{types.OkResult{RowsAffected: 1}},
7204+
},
7205+
},
7206+
{
7207+
Query: "insert into charset_test(txt) values (UNHEX('446F6C744C6162AE'));",
7208+
Expected: []sql.Row{
7209+
{types.OkResult{RowsAffected: 1}},
7210+
},
7211+
},
7212+
{
7213+
Query: "select HEX(c), LENGTH(c) from charset_test where c is not null;",
7214+
Expected: []sql.Row{
7215+
{"446F6C744C6162", 7},
7216+
},
7217+
},
7218+
{
7219+
Query: "select HEX(v), LENGTH(v) from charset_test where v is not null;",
7220+
Expected: []sql.Row{
7221+
{"446F6C744C6162", 7},
7222+
},
7223+
},
7224+
{
7225+
Query: "select HEX(txt), LENGTH(txt) from charset_test where txt is not null;",
7226+
Expected: []sql.Row{
7227+
{"446F6C744C6162", 7},
7228+
},
7229+
},
7230+
},
7231+
},
7232+
{
7233+
Name: "charset validation issue #8893 - customer scenario",
7234+
Dialect: "mysql",
7235+
SetUpScript: []string{
7236+
"create table products (id int primary key, name text character set utf8mb4);",
7237+
},
7238+
Assertions: []ScriptTestAssertion{
7239+
// Test charset validation with invalid UTF-8 data
7240+
{
7241+
Query: "insert into products values (1, UNHEX('446F6C744C6162AE'));", // "DoltLab" + invalid byte 0xAE
7242+
ExpectedErrStr: "Incorrect string value: '\\xAE' for column 'name' at row 1",
7243+
},
7244+
// Test non-strict mode truncation behavior
7245+
{
7246+
Query: "set sql_mode = '';",
7247+
Expected: []sql.Row{{types.OkResult{RowsAffected: 0}}},
7248+
},
7249+
{
7250+
Query: "insert into products values (1, UNHEX('446F6C744C6162AE'));", // Now succeeds with truncation
7251+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7252+
},
7253+
// Verify data was truncated at invalid byte (MySQL behavior)
7254+
{
7255+
Query: "select id, name, HEX(name) from products;",
7256+
Expected: []sql.Row{
7257+
{1, "DoltLab", "446F6C744C6162"}, // Invalid byte 0xAE was truncated
7258+
},
7259+
},
7260+
// Customer can now query and work with the data
7261+
{
7262+
Query: "select id, name from products where name like '%Lab%';",
7263+
Expected: []sql.Row{
7264+
{1, "DoltLab"},
7265+
},
7266+
},
7267+
},
7268+
},
7269+
{
7270+
Name: "charset validation edge cases - formatInvalidByteForError testing",
7271+
Dialect: "mysql",
7272+
SetUpScript: []string{
7273+
"create table charset_edge_test (c char(10), v varchar(20), t text);",
7274+
},
7275+
Assertions: []ScriptTestAssertion{
7276+
// STRICT MODE TESTS
7277+
{
7278+
Query: "set sql_mode = 'STRICT_TRANS_TABLES';",
7279+
Expected: []sql.Row{{types.OkResult{RowsAffected: 0}}},
7280+
},
7281+
// Single invalid byte (0xAE)
7282+
{
7283+
Query: "insert into charset_edge_test(c) values (UNHEX('AE'));",
7284+
ExpectedErrStr: "Incorrect string value: '\\xAE' for column 'c' at row 1",
7285+
},
7286+
{
7287+
Query: "insert into charset_edge_test(v) values (UNHEX('AE'));",
7288+
ExpectedErrStr: "Incorrect string value: '\\xAE' for column 'v' at row 1",
7289+
},
7290+
{
7291+
Query: "insert into charset_edge_test(t) values (UNHEX('AE'));",
7292+
ExpectedErrStr: "Incorrect string value: '\\xAE' for column 't' at row 1",
7293+
},
7294+
// Multiple invalid bytes
7295+
{
7296+
Query: "insert into charset_edge_test(c) values (UNHEX('AEAEAE'));",
7297+
ExpectedErrStr: "Incorrect string value: '\\xAE\\xAE\\xAE' for column 'c' at row 1",
7298+
},
7299+
// Bytes 0xC0 and 0xC1: never valid in UTF-8 (they could only begin overlong 2-byte encodings)
7300+
{
7301+
Query: "insert into charset_edge_test(c) values (UNHEX('C0C1'));",
7302+
ExpectedErrStr: "Incorrect string value: '\\xC0\\xC1' for column 'c' at row 1",
7303+
},
7304+
// Invalid bytes 0xFE, 0xFF
7305+
{
7306+
Query: "insert into charset_edge_test(c) values (UNHEX('FE'));",
7307+
ExpectedErrStr: "Incorrect string value: '\\xFE' for column 'c' at row 1",
7308+
},
7309+
{
7310+
Query: "insert into charset_edge_test(c) values (UNHEX('FF'));",
7311+
ExpectedErrStr: "Incorrect string value: '\\xFF' for column 'c' at row 1",
7312+
},
7313+
// Lone UTF-16 surrogate code points encoded as UTF-8 (U+D800 and U+DFFF), which UTF-8 forbids
7314+
{
7315+
Query: "insert into charset_edge_test(c) values (UNHEX('EDA080'));",
7316+
ExpectedErrStr: "Incorrect string value: '\\xED\\xA0\\x80' for column 'c' at row 1",
7317+
},
7318+
{
7319+
Query: "insert into charset_edge_test(c) values (UNHEX('EDBFBF'));",
7320+
ExpectedErrStr: "Incorrect string value: '\\xED\\xBF\\xBF' for column 'c' at row 1",
7321+
},
7322+
// More overlong sequences
7323+
{
7324+
Query: "insert into charset_edge_test(c) values (UNHEX('C080'));",
7325+
ExpectedErrStr: "Incorrect string value: '\\xC0\\x80' for column 'c' at row 1",
7326+
},
7327+
{
7328+
Query: "insert into charset_edge_test(c) values (UNHEX('E08080'));",
7329+
ExpectedErrStr: "Incorrect string value: '\\xE0\\x80\\x80' for column 'c' at row 1",
7330+
},
7331+
{
7332+
Query: "insert into charset_edge_test(c) values (UNHEX('F0808080'));",
7333+
ExpectedErrStr: "Incorrect string value: '\\xF0\\x80\\x80\\x80' for column 'c' at row 1",
7334+
},
7335+
// Out of range (beyond U+10FFFF)
7336+
{
7337+
Query: "insert into charset_edge_test(c) values (UNHEX('F4908080'));",
7338+
ExpectedErrStr: "Incorrect string value: '\\xF4\\x90\\x80\\x80' for column 'c' at row 1",
7339+
},
7340+
// Continuation bytes without start byte
7341+
{
7342+
Query: "insert into charset_edge_test(c) values (UNHEX('80'));",
7343+
ExpectedErrStr: "Incorrect string value: '\\x80' for column 'c' at row 1",
7344+
},
7345+
{
7346+
Query: "insert into charset_edge_test(c) values (UNHEX('BF'));",
7347+
ExpectedErrStr: "Incorrect string value: '\\xBF' for column 'c' at row 1",
7348+
},
7349+
// Incomplete sequences
7350+
{
7351+
Query: "insert into charset_edge_test(c) values (UNHEX('C2'));",
7352+
ExpectedErrStr: "Incorrect string value: '\\xC2' for column 'c' at row 1",
7353+
},
7354+
{
7355+
Query: "insert into charset_edge_test(c) values (UNHEX('E0A0'));",
7356+
ExpectedErrStr: "Incorrect string value: '\\xE0\\xA0' for column 'c' at row 1",
7357+
},
7358+
{
7359+
Query: "insert into charset_edge_test(c) values (UNHEX('F09080'));",
7360+
ExpectedErrStr: "Incorrect string value: '\\xF0\\x90\\x80' for column 'c' at row 1",
7361+
},
7362+
// Long sequence (tests truncation with ...)
7363+
{
7364+
Query: "insert into charset_edge_test(c) values (UNHEX('999897969594939291'));",
7365+
ExpectedErrStr: "Incorrect string value: '\\x99\\x98\\x97\\x96\\x95\\x94...' for column 'c' at row 1",
7366+
},
7367+
// Valid UTF-8 with invalid bytes
7368+
{
7369+
Query: "insert into charset_edge_test(c) values (UNHEX('446F6C744C6162AE'));",
7370+
ExpectedErrStr: "Incorrect string value: '\\xAE' for column 'c' at row 1",
7371+
},
7372+
7373+
// NON-STRICT MODE TESTS (should truncate)
7374+
{
7375+
Query: "set sql_mode = '';",
7376+
Expected: []sql.Row{{types.OkResult{RowsAffected: 0}}},
7377+
},
7378+
{
7379+
Query: "insert into charset_edge_test(c) values (UNHEX('446F6C744C6162AE'));",
7380+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7381+
},
7382+
{
7383+
Query: "insert into charset_edge_test(v) values (UNHEX('48656C6C6FC0'));",
7384+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7385+
},
7386+
{
7387+
Query: "insert into charset_edge_test(t) values (UNHEX('54657374FF'));",
7388+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7389+
},
7390+
// Verify truncated data
7391+
{
7392+
Query: "select HEX(c), LENGTH(c) from charset_edge_test where c is not null;",
7393+
Expected: []sql.Row{
7394+
{"446F6C744C6162", 7},
7395+
},
7396+
},
7397+
{
7398+
Query: "select HEX(v), LENGTH(v) from charset_edge_test where v is not null;",
7399+
Expected: []sql.Row{
7400+
{"48656C6C6F", 5},
7401+
},
7402+
},
7403+
{
7404+
Query: "select HEX(t), LENGTH(t) from charset_edge_test where t is not null;",
7405+
Expected: []sql.Row{
7406+
{"54657374", 4},
7407+
},
7408+
},
7409+
},
7410+
},
7411+
{
7412+
Name: "charset validation ASCII range tests",
7413+
Dialect: "mysql",
7414+
SetUpScript: []string{
7415+
"create table ascii_test (c char(10), v varchar(20), t text);",
7416+
},
7417+
Assertions: []ScriptTestAssertion{
7418+
{
7419+
Query: "set sql_mode = 'STRICT_TRANS_TABLES';",
7420+
Expected: []sql.Row{{types.OkResult{RowsAffected: 0}}},
7421+
},
7422+
// ASCII range 0x00-0x7F
7423+
{
7424+
Query: "insert into ascii_test(c) values (UNHEX('00'));",
7425+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7426+
},
7427+
{
7428+
Query: "insert into ascii_test(c) values (UNHEX('20'));",
7429+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7430+
},
7431+
{
7432+
Query: "insert into ascii_test(c) values (UNHEX('41'));",
7433+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7434+
},
7435+
{
7436+
Query: "insert into ascii_test(c) values (UNHEX('7F'));",
7437+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7438+
},
7439+
{
7440+
Query: "insert into ascii_test(v) values (UNHEX('48656C6C6F'));", // "Hello"
7441+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7442+
},
7443+
{
7444+
Query: "insert into ascii_test(t) values (UNHEX('00207F41'));",
7445+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7446+
},
7447+
// Verify ASCII data
7448+
{
7449+
Query: "select HEX(c), LENGTH(c) from ascii_test where c is not null order by c;",
7450+
Expected: []sql.Row{
7451+
{"00", 1},
7452+
{"20", 1},
7453+
{"41", 1},
7454+
{"7F", 1},
7455+
},
7456+
},
7457+
{
7458+
Query: "select HEX(v), LENGTH(v) from ascii_test where v is not null;",
7459+
Expected: []sql.Row{
7460+
{"48656C6C6F", 5}, // "Hello"
7461+
},
7462+
},
7463+
{
7464+
Query: "select HEX(t), LENGTH(t) from ascii_test where t is not null;",
7465+
Expected: []sql.Row{
7466+
{"00207F41", 4}, // NUL (0x00) + SPACE + DEL + A
7467+
},
7468+
},
7469+
// Boundary cases
7470+
{
7471+
Query: "insert into ascii_test(c) values (UNHEX('7E'));", // 0x7E is valid ASCII
7472+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7473+
},
7474+
{
7475+
Query: "insert into ascii_test(c) values (UNHEX('81'));", // 0x81 is invalid
7476+
ExpectedErrStr: "Incorrect string value: '\\x81' for column 'c' at row 1",
7477+
},
7478+
// Mixed ASCII and invalid (non-strict mode)
7479+
{
7480+
Query: "set sql_mode = '';", // Non-strict mode
7481+
Expected: []sql.Row{{types.OkResult{RowsAffected: 0}}},
7482+
},
7483+
{
7484+
Query: "insert into ascii_test(c) values (UNHEX('41424380'));", // ABC + 0x80 (invalid)
7485+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7486+
},
7487+
// Verify truncation
7488+
{
7489+
Query: "select HEX(c), LENGTH(c) from ascii_test where HEX(c) = '414243';",
7490+
Expected: []sql.Row{
7491+
{"414243", 3}, // "ABC" - truncated at invalid byte
7492+
},
7493+
},
7494+
// Valid UTF-8 sequences
7495+
{
7496+
Query: "set sql_mode = 'STRICT_TRANS_TABLES';", // Back to strict mode
7497+
Expected: []sql.Row{{types.OkResult{RowsAffected: 0}}},
7498+
},
7499+
{
7500+
Query: "insert into ascii_test(c) values (UNHEX('C3A9'));", // é (2-byte UTF-8)
7501+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7502+
},
7503+
{
7504+
Query: "insert into ascii_test(c) values (UNHEX('E282AC'));", // € (3-byte UTF-8)
7505+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7506+
},
7507+
{
7508+
Query: "insert into ascii_test(c) values (UNHEX('F09D849E'));", // 𝄞 (4-byte UTF-8)
7509+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7510+
},
7511+
// Function boundary constants (asciiMin=32, asciiMax=127)
7512+
{
7513+
Query: "insert into ascii_test(c) values (UNHEX('1F'));", // ASCII 31 (below asciiMin=32) - valid ASCII but non-printable
7514+
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
7515+
},
7516+
// Note: UNHEX('80') test is covered in edge cases test above
7517+
},
7518+
},
71677519
{
71687520
Name: "unix_timestamp script tests",
71697521
Dialect: "mysql",

0 commit comments

Comments
 (0)