From 1b6b2ba222075e02d52091412070ddc885e0e9e8 Mon Sep 17 00:00:00 2001 From: elianddb Date: Thu, 16 Oct 2025 13:48:50 -0700 Subject: [PATCH 1/6] add test --- enginetest/memory_engine_test.go | 28 +++++++++++++++++----------- enginetest/testdata/data9969 | 1 + 2 files changed, 18 insertions(+), 11 deletions(-) create mode 100644 enginetest/testdata/data9969 diff --git a/enginetest/memory_engine_test.go b/enginetest/memory_engine_test.go index f1ec7b45d0..583778f96d 100644 --- a/enginetest/memory_engine_test.go +++ b/enginetest/memory_engine_test.go @@ -200,22 +200,28 @@ func TestSingleQueryPrepared(t *testing.T) { // Convenience test for debugging a single query. Unskip and set to the desired query. func TestSingleScript(t *testing.T) { - t.Skip() + //t.Skip() var scripts = []queries.ScriptTest{ { - Name: "AS OF propagates to nested CALLs", - SetUpScript: []string{}, + Name: "Dolt diff query returns correct tables (regression 1.59.18)", + SetUpScript: []string{ + "SET GLOBAL local_infile=1;", + `CREATE TABLE my_table ( + id int NOT NULL, + txt varchar(16) NOT NULL, + PRIMARY KEY (id) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;`, + `LOAD DATA INFILE 'C:/Users/Elian/dolt_workspace/db/9969/data' + INTO TABLE my_table + FIELDS TERMINATED BY ',' ENCLOSED BY '"' ESCAPED BY '"' + LINES TERMINATED BY '\n' + (id, txt);`, + }, Assertions: []queries.ScriptTestAssertion{ { - Query: "create procedure create_proc() create table t (i int primary key, j int);", - Expected: []sql.Row{ - {types.NewOkResult(0)}, - }, - }, - { - Query: "call create_proc()", + Query: `SELECT * FROM my_table LIMIT 1`, Expected: []sql.Row{ - {types.NewOkResult(0)}, + {1, "foo,bar"}, }, }, }, diff --git a/enginetest/testdata/data9969 b/enginetest/testdata/data9969 new file mode 100644 index 0000000000..1e1eb13033 --- /dev/null +++ b/enginetest/testdata/data9969 @@ -0,0 +1 @@ +"1","foo,bar" \ No newline at end of file From 0b3ea4b9f9787aadce309bc89b854570bee9300a Mon Sep 17 00:00:00 2001 From: elianddb Date: Thu, 16 Oct 2025 16:41:20 -0700 Subject: [PATCH 2/6] impl ch-by-ch parsing for quoted fields --- enginetest/memory_engine_test.go | 28 ++-- enginetest/queries/load_queries.go | 63 +++++++++ .../testdata/{data9969 => loaddata_9969.dat} | 0 sql/rowexec/ddl_iters.go | 128 +++++++++++++----- 4 files changed, 167 insertions(+), 52 deletions(-) rename enginetest/testdata/{data9969 => loaddata_9969.dat} (100%) diff --git a/enginetest/memory_engine_test.go b/enginetest/memory_engine_test.go index 583778f96d..f1ec7b45d0 100644 --- a/enginetest/memory_engine_test.go +++ b/enginetest/memory_engine_test.go @@ -200,28 +200,22 @@ func TestSingleQueryPrepared(t *testing.T) { // Convenience test for debugging a single query. Unskip and set to the desired query. func TestSingleScript(t *testing.T) { - //t.Skip() + t.Skip() var scripts = []queries.ScriptTest{ { - Name: "Dolt diff query returns correct tables (regression 1.59.18)", - SetUpScript: []string{ - "SET GLOBAL local_infile=1;", - `CREATE TABLE my_table ( - id int NOT NULL, - txt varchar(16) NOT NULL, - PRIMARY KEY (id) - ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;`, - `LOAD DATA INFILE 'C:/Users/Elian/dolt_workspace/db/9969/data' - INTO TABLE my_table - FIELDS TERMINATED BY ',' ENCLOSED BY '"' ESCAPED BY '"' - LINES TERMINATED BY '\n' - (id, txt);`, - }, + Name: "AS OF propagates to nested CALLs", + SetUpScript: []string{}, Assertions: []queries.ScriptTestAssertion{ { - Query: `SELECT * FROM my_table LIMIT 1`, + Query: "create procedure create_proc() create table t (i int primary key, j int);", + Expected: []sql.Row{ + {types.NewOkResult(0)}, + }, + }, + { + Query: "call create_proc()", Expected: []sql.Row{ - {1, "foo,bar"}, + {types.NewOkResult(0)}, }, }, }, diff --git a/enginetest/queries/load_queries.go b/enginetest/queries/load_queries.go index 7006f25b8c..15534afbe2 100644 --- a/enginetest/queries/load_queries.go +++ b/enginetest/queries/load_queries.go @@ -24,6 +24,68 @@ import ( ) var LoadDataScripts = []ScriptTest{ + { + // https://github.com/dolthub/dolt/issues/9969 + Name: "LOAD DATA with ENCLOSED BY and ESCAPED BY parsing", + SetUpScript: []string{ + "create table t1(pk int primary key, c1 longtext)", + "LOAD DATA INFILE './testdata/loaddata_9969.dat' INTO TABLE t1 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\"'", + "create table t2(pk int primary key, c1 longtext)", + "LOAD DATA INFILE './testdata/loaddata_escape.dat' INTO TABLE t2 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\\\\'", + "create table t3(a varchar(20), b varchar(20))", + "LOAD DATA INFILE './testdata/loaddata_enclosed.dat' INTO TABLE t3 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\"'", + "create table t4(a varchar(20), b varchar(20))", + "LOAD DATA INFILE './testdata/loaddata_mixed_escapes.dat' INTO TABLE t4 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\\\\'", + "create table t5(a text, b text)", + "LOAD DATA INFILE './testdata/loaddata_single_quotes.dat' INTO TABLE t5 FIELDS TERMINATED BY ',' ENCLOSED BY ''''", + "create table t6(pk int, a varchar(20), b varchar(20))", + "LOAD DATA INFILE './testdata/loaddata_nulls.dat' INTO TABLE t6 FIELDS TERMINATED BY ','", + }, + Assertions: []ScriptTestAssertion{ + { + Query: "select * from t1", + Expected: []sql.Row{{1, "foo,bar"}}, + }, + { + Query: "select * from t2", + Expected: []sql.Row{{1, "foo,bar"}}, + }, + { + Query: "select * from t3 ORDER BY a", + Expected: []sql.Row{ + {"a\"b", "cd\"ef"}, + {"field1", "field2"}, + {"foo,bar", "baz,qux"}, + }, + }, + { + Query: "select * from t4", + Expected: []sql.Row{ + {nil, "\x1A"}, + {"a,b", "c,d"}, + {"hello\nworld", "foo\tbar"}, + }, + }, + { + Query: "select * from t5", // order by a breaks + Expected: []sql.Row{ + {"Field A", "Field B"}, + {"Field 1", "Field 2"}, + {"Field 3", "Field 4"}, + {"Field 5", "Field 6"}, + }, + }, + { + Query: "select * from t6 ORDER BY pk", + Expected: []sql.Row{ + {1, "hello", "world"}, + {2, nil, "test"}, + {3, "", "empty"}, + {4, nil, nil}, + }, + }, + }, + }, { Name: "LOAD DATA applies column defaults when \\N provided", SetUpScript: []string{ @@ -128,6 +190,7 @@ var LoadDataScripts = []ScriptTest{ }, }, }, + // https://github.com/dolthub/dolt/issues/9969 { Name: "Load JSON data. EnclosedBy and EscapedBy are the same.", SetUpScript: []string{ diff --git a/enginetest/testdata/data9969 b/enginetest/testdata/loaddata_9969.dat similarity index 100% rename from enginetest/testdata/data9969 rename to enginetest/testdata/loaddata_9969.dat diff --git a/sql/rowexec/ddl_iters.go b/sql/rowexec/ddl_iters.go index ecbc16be97..d405710381 100644 --- a/sql/rowexec/ddl_iters.go +++ b/sql/rowexec/ddl_iters.go @@ -131,54 +131,112 @@ func (l *loadDataIter) parseLinePrefix(line string) string { } func (l *loadDataIter) parseFields(ctx *sql.Context, line string) ([]sql.Expression, error) { - // Step 1. Start by Searching for prefix if there is one + // Start by searching for prefix if there is one line = l.parseLinePrefix(line) if line == "" { return nil, nil } - // Step 2: Split the lines into fields given the delim - fields := strings.Split(line, l.fieldsTerminatedBy) + // Split the line into fields. When ENCLOSED BY is specified, fields must be parsed + // character-by-character to respect quoted fields that may contain the field terminator. + var fields []string + if l.fieldsEnclosedBy == "" { + fields = strings.Split(line, l.fieldsTerminatedBy) + } else { + var currentField strings.Builder + inEnclosure := false + encChar := l.fieldsEnclosedBy[0] + escChar := byte(0) + if l.fieldsEscapedBy != "" { + escChar = l.fieldsEscapedBy[0] + } + termLen := len(l.fieldsTerminatedBy) + + for i := 0; i < len(line); i++ { + c := line[i] + + // Handle enclosure character + if c == encChar { + if inEnclosure { + // Check for doubled enclosure (escape mechanism when encChar == escChar) + if i+1 < len(line) && line[i+1] == encChar { + currentField.WriteByte(encChar) + i++ + continue + } + inEnclosure = false + continue + } + if currentField.Len() == 0 { + inEnclosure = true + continue + } + } - // Step 3: Go through each field and see if it was enclosed by something - // TODO: Support the OPTIONALLY parameter. - if l.fieldsEnclosedBy != "" { - for i, field := range fields { - if field[0] == l.fieldsEnclosedBy[0] && field[len(field)-1] == l.fieldsEnclosedBy[0] { - fields[i] = field[1 : len(field)-1] - } else { - return nil, fmt.Errorf("error: field not properly enclosed") + // Handle escape character (only when different from enclosure character) + if escChar != 0 && escChar != encChar && c == escChar && i+1 < len(line) { + currentField.WriteByte(c) + i++ + currentField.WriteByte(line[i]) + continue + } + + // Handle field terminator (only outside enclosures) + if !inEnclosure && i+termLen <= len(line) && line[i:i+termLen] == l.fieldsTerminatedBy { + fields = append(fields, currentField.String()) + currentField.Reset() + i += termLen - 1 + continue } + + currentField.WriteByte(c) + } + + fields = append(fields, currentField.String()) + if !l.fieldsEnclosedByOpt && inEnclosure { + return nil, fmt.Errorf("error: unterminated enclosed field") } } - // Step 4: Handle the ESCAPED BY parameter. - if l.fieldsEscapedBy != "" { + // Handle ESCAPED BY parameter for special sequences like \N, \Z, \0, \n, \t, etc. + // When ESCAPED BY equals ENCLOSED BY, escaping was already handled via doubling. + if l.fieldsEscapedBy != "" && l.fieldsEscapedBy != l.fieldsEnclosedBy { + escByte := l.fieldsEscapedBy[0] for i, field := range fields { - if field == "\\N" { - fields[i] = "NULL" - } else if field == "\\Z" { - fields[i] = fmt.Sprintf("%c", 26) // ASCII 26 - } else if field == "\\0" { - fields[i] = fmt.Sprintf("%c", 0) // ASCII 0 - } else { - // The character immediately following the escaped character remains untouched, even if it is the same - // as the escape character - newField := make([]byte, 0, len(field)) - for cIdx := 0; cIdx < len(field); cIdx++ { - c := field[cIdx] - // skip over escaped character, but always add the following character - if c == l.fieldsEscapedBy[0] { - cIdx += 1 - if cIdx < len(field) { - newField = append(newField, c) - } - continue - } - newField = append(newField, c) + if !strings.ContainsRune(field, rune(escByte)) { + continue + } + + newField := make([]byte, 0, len(field)) + for j := 0; j < len(field); j++ { + if field[j] != escByte || j+1 >= len(field) { + newField = append(newField, field[j]) + continue + } + + j++ + switch field[j] { + case 'N': + fields[i] = "NULL" + goto nextField + case 'Z': + newField = append(newField, 26) + case '0': + newField = append(newField, 0) + case 'n': + newField = append(newField, '\n') + case 't': + newField = append(newField, '\t') + case 'r': + newField = append(newField, '\r') + case 'b': + newField = append(newField, '\b') + default: + newField = append(newField, field[j]) } - fields[i] = string(newField) } + fields[i] = string(newField) + nextField: } } From 11b740d857549033c34e473f2bd229749c59204b Mon Sep 17 00:00:00 2001 From: elianddb Date: Thu, 16 Oct 2025 16:43:26 -0700 Subject: [PATCH 3/6] add loaddata test files --- enginetest/testdata/loaddata_enclosed.dat | 3 +++ enginetest/testdata/loaddata_escape.dat | 1 + enginetest/testdata/loaddata_mixed_escapes.dat | 3 +++ enginetest/testdata/loaddata_nulls.dat | 4 ++++ enginetest/testdata/loaddata_single_quotes.dat | 4 ++++ 5 files changed, 15 insertions(+) create mode 100644 enginetest/testdata/loaddata_enclosed.dat create mode 100644 enginetest/testdata/loaddata_escape.dat create mode 100644 enginetest/testdata/loaddata_mixed_escapes.dat create mode 100644 enginetest/testdata/loaddata_nulls.dat create mode 100644 enginetest/testdata/loaddata_single_quotes.dat diff --git a/enginetest/testdata/loaddata_enclosed.dat b/enginetest/testdata/loaddata_enclosed.dat new file mode 100644 index 0000000000..f00e9d5897 --- /dev/null +++ b/enginetest/testdata/loaddata_enclosed.dat @@ -0,0 +1,3 @@ +"field1","field2" +"a""b","cd""ef" +"foo,bar","baz,qux" \ No newline at end of file diff --git a/enginetest/testdata/loaddata_escape.dat b/enginetest/testdata/loaddata_escape.dat new file mode 100644 index 0000000000..395e5e45d3 --- /dev/null +++ b/enginetest/testdata/loaddata_escape.dat @@ -0,0 +1 @@ +"1","foo\,bar" \ No newline at end of file diff --git a/enginetest/testdata/loaddata_mixed_escapes.dat b/enginetest/testdata/loaddata_mixed_escapes.dat new file mode 100644 index 0000000000..5da4c8ba6a --- /dev/null +++ b/enginetest/testdata/loaddata_mixed_escapes.dat @@ -0,0 +1,3 @@ +"hello\nworld","foo\tbar" +"a\,b","c\,d" +"\N","\Z" \ No newline at end of file diff --git a/enginetest/testdata/loaddata_nulls.dat b/enginetest/testdata/loaddata_nulls.dat new file mode 100644 index 0000000000..df385b2d69 --- /dev/null +++ b/enginetest/testdata/loaddata_nulls.dat @@ -0,0 +1,4 @@ +1,hello,world +2,\N,test +3,,empty +4,\N,\N \ No newline at end of file diff --git a/enginetest/testdata/loaddata_single_quotes.dat b/enginetest/testdata/loaddata_single_quotes.dat new file mode 100644 index 0000000000..7dcedab6f8 --- /dev/null +++ b/enginetest/testdata/loaddata_single_quotes.dat @@ -0,0 +1,4 @@ +Field A,'Field B' +Field 1,'Field 2' +Field 3,'Field 4' +'Field 5','Field 6' \ No newline at end of file From 44b3c00f9e24c33fd3a09225940c0803838310fa Mon Sep 17 00:00:00 2001 From: elianddb Date: Thu, 16 Oct 2025 16:50:53 -0700 Subject: [PATCH 4/6] amend .gitattributes with loaddata test files --- .gitattributes | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/.gitattributes b/.gitattributes index 6168d3d095..b75c1fc47f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,12 +1,18 @@ -enginetest/testdata/test1.txt binary -enginetest/testdata/test2.csv binary -enginetest/testdata/test3.csv binary -enginetest/testdata/test3backwards.csv binary -enginetest/testdata/test4.txt binary -enginetest/testdata/test5.txt binary -enginetest/testdata/test6.csv binary -enginetest/testdata/test7.txt binary -enginetest/testdata/test8.txt binary -enginetest/testdata/test9.txt binary -enginetest/testdata/test10.txt binary -enginetest/testdata/simple_json.txt binary \ No newline at end of file +enginetest/testdata/test1.txt binary +enginetest/testdata/test2.csv binary +enginetest/testdata/test3.csv binary +enginetest/testdata/test3backwards.csv binary +enginetest/testdata/test4.txt binary +enginetest/testdata/test5.txt binary +enginetest/testdata/test6.csv binary +enginetest/testdata/test7.txt binary +enginetest/testdata/test8.txt binary +enginetest/testdata/test9.txt binary +enginetest/testdata/test10.txt binary +enginetest/testdata/simple_json.txt binary +enginetest/testdata/loaddata_9969.dat binary +enginetest/testdata/loaddata_escape.dat binary +enginetest/testdata/loaddata_enclosed.dat binary +enginetest/testdata/loaddata_single_quotes.dat binary +enginetest/testdata/loaddata_nulls.dat binary +enginetest/testdata/loaddata_mixed_escapes.dat binary \ No newline at end of file From b9b32d35c138272a1b78eccb3f1d090e8a6a2ec8 Mon Sep 17 00:00:00 2001 From: elianddb Date: Fri, 17 Oct 2025 00:08:15 -0700 Subject: [PATCH 5/6] amend to use ch-by-ch only --- enginetest/queries/load_queries.go | 1 - sql/rowexec/ddl_iters.go | 144 +++++++++++------------------ 2 files changed, 56 insertions(+), 89 deletions(-) diff --git a/enginetest/queries/load_queries.go b/enginetest/queries/load_queries.go index 15534afbe2..e8ec6ad173 100644 --- a/enginetest/queries/load_queries.go +++ b/enginetest/queries/load_queries.go @@ -190,7 +190,6 @@ var LoadDataScripts = []ScriptTest{ }, }, }, - // https://github.com/dolthub/dolt/issues/9969 { Name: "Load JSON data. EnclosedBy and EscapedBy are the same.", SetUpScript: []string{ diff --git a/sql/rowexec/ddl_iters.go b/sql/rowexec/ddl_iters.go index d405710381..16be51d784 100644 --- a/sql/rowexec/ddl_iters.go +++ b/sql/rowexec/ddl_iters.go @@ -137,107 +137,75 @@ func (l *loadDataIter) parseFields(ctx *sql.Context, line string) ([]sql.Express return nil, nil } - // Split the line into fields. When ENCLOSED BY is specified, fields must be parsed - // character-by-character to respect quoted fields that may contain the field terminator. + // line is parsed character-by-character to respect enclosed fields that may contain the field terminator var fields []string - if l.fieldsEnclosedBy == "" { - fields = strings.Split(line, l.fieldsTerminatedBy) - } else { - var currentField strings.Builder - inEnclosure := false - encChar := l.fieldsEnclosedBy[0] - escChar := byte(0) - if l.fieldsEscapedBy != "" { - escChar = l.fieldsEscapedBy[0] - } - termLen := len(l.fieldsTerminatedBy) - - for i := 0; i < len(line); i++ { - c := line[i] - - // Handle enclosure character - if c == encChar { - if inEnclosure { - // Check for doubled enclosure (escape mechanism when encChar == escChar) - if i+1 < len(line) && line[i+1] == encChar { - currentField.WriteByte(encChar) - i++ - continue - } - inEnclosure = false - continue - } - if currentField.Len() == 0 { - inEnclosure = true + var currentField strings.Builder + var encChar, escChar byte + if l.fieldsEnclosedBy != "" { + encChar = l.fieldsEnclosedBy[0] + } + if l.fieldsEscapedBy != "" { + escChar = l.fieldsEscapedBy[0] + } + termLen := len(l.fieldsTerminatedBy) + inEnclosure := false + + for i := 0; i < len(line); i++ { + ch := line[i] + if ch == encChar { + if inEnclosure { + // consume escaped char when encChar = escChar + if ch == escChar && i+1 < len(line) && line[i+1] == encChar { + currentField.WriteByte(encChar) + i++ continue } - } - - // Handle escape character (only when different from enclosure character) - if escChar != 0 && escChar != encChar && c == escChar && i+1 < len(line) { - currentField.WriteByte(c) - i++ - currentField.WriteByte(line[i]) + inEnclosure = false continue } - - // Handle field terminator (only outside enclosures) - if !inEnclosure && i+termLen <= len(line) && line[i:i+termLen] == l.fieldsTerminatedBy { - fields = append(fields, currentField.String()) - currentField.Reset() - i += termLen - 1 + if currentField.Len() == 0 { + inEnclosure = true continue } - - currentField.WriteByte(c) } - fields = append(fields, currentField.String()) - if !l.fieldsEnclosedByOpt && inEnclosure { - return nil, fmt.Errorf("error: unterminated enclosed field") + // we consumed the char above so we don't process when encChar = escChar + if escChar != encChar && ch == escChar && i+1 < len(line) { + i++ + switch line[i] { + case 'N': + currentField.WriteString("NULL") + case 'Z': + currentField.WriteByte(26) + case '0': + currentField.WriteByte(0) + case 'n': + currentField.WriteByte('\n') + case 't': + currentField.WriteByte('\t') + case 'r': + currentField.WriteByte('\r') + case 'b': + currentField.WriteByte('\b') + default: + currentField.WriteByte(line[i]) + } + continue } - } - // Handle ESCAPED BY parameter for special sequences like \N, \Z, \0, \n, \t, etc. - // When ESCAPED BY equals ENCLOSED BY, escaping was already handled via doubling. - if l.fieldsEscapedBy != "" && l.fieldsEscapedBy != l.fieldsEnclosedBy { - escByte := l.fieldsEscapedBy[0] - for i, field := range fields { - if !strings.ContainsRune(field, rune(escByte)) { - continue - } + if !inEnclosure && i+termLen <= len(line) && line[i:i+termLen] == l.fieldsTerminatedBy { + fields = append(fields, currentField.String()) + currentField.Reset() + i += termLen - 1 + continue + } - newField := make([]byte, 0, len(field)) - for j := 0; j < len(field); j++ { - if field[j] != escByte || j+1 >= len(field) { - newField = append(newField, field[j]) - continue - } + currentField.WriteByte(ch) + } - j++ - switch field[j] { - case 'N': - fields[i] = "NULL" - goto nextField - case 'Z': - newField = append(newField, 26) - case '0': - newField = append(newField, 0) - case 'n': - newField = append(newField, '\n') - case 't': - newField = append(newField, '\t') - case 'r': - newField = append(newField, '\r') - case 'b': - newField = append(newField, '\b') - default: - newField = append(newField, field[j]) - } - } - fields[i] = string(newField) - nextField: - } + fields = append(fields, currentField.String()) + if !l.fieldsEnclosedByOpt && inEnclosure { + return nil, fmt.Errorf("error: unterminated enclosed field") } fieldRow := make(sql.Row, len(fields)) From c6c7daff2268427356058f4bb0cdb54942765e38 Mon Sep 17 00:00:00 2001 From: elianddb Date: Mon, 20 Oct 2025 13:45:54 -0700 Subject: [PATCH 6/6] add line term in scanner split to detect eof when enc==esc edge case --- .gitattributes | 10 +- enginetest/queries/load_queries.go | 42 +++++++- enginetest/testdata/loaddata_9969.dat | 1 - enginetest/testdata/loaddata_enc_esc_eq.dat | 2 + enginetest/testdata/loaddata_enclosed.dat | 2 +- enginetest/testdata/loaddata_eof.dat | 2 + enginetest/testdata/loaddata_escape.dat | 2 +- enginetest/testdata/loaddata_lborder_null.dat | Bin 0 -> 10 bytes .../testdata/loaddata_mixed_escapes.dat | 2 +- .../testdata/loaddata_null_in_field.dat | Bin 0 -> 15 bytes enginetest/testdata/loaddata_nulls.dat | 2 +- .../testdata/loaddata_single_quotes.dat | 2 +- .../testdata/loaddata_term_in_field.dat | 1 + enginetest/testdata/simple_json.txt | 2 +- sql/plan/load_data.go | 5 +- sql/rowexec/ddl_iters.go | 96 ++++++++++-------- 16 files changed, 114 insertions(+), 57 deletions(-) delete mode 100644 enginetest/testdata/loaddata_9969.dat create mode 100644 enginetest/testdata/loaddata_enc_esc_eq.dat create mode 100644 enginetest/testdata/loaddata_eof.dat create mode 100644 enginetest/testdata/loaddata_lborder_null.dat create mode 100644 enginetest/testdata/loaddata_null_in_field.dat create mode 100644 enginetest/testdata/loaddata_term_in_field.dat diff --git a/.gitattributes b/.gitattributes index b75c1fc47f..33eef5d0d0 100644 --- a/.gitattributes +++ b/.gitattributes @@ -10,9 +10,13 @@ enginetest/testdata/test8.txt binary enginetest/testdata/test9.txt binary enginetest/testdata/test10.txt binary enginetest/testdata/simple_json.txt binary -enginetest/testdata/loaddata_9969.dat binary -enginetest/testdata/loaddata_escape.dat binary +enginetest/testdata/loaddata_null_in_field.dat binary +enginetest/testdata/loaddata_lborder_null.dat binary +enginetest/testdata/loaddata_enc_esc_eq.dat binary +enginetest/testdata/loaddata_eof.dat binary +enginetest/testdata/loaddata_term_in_field.dat binary +enginetest/testdata/loaddata_mixed_escapes.dat binary enginetest/testdata/loaddata_enclosed.dat binary enginetest/testdata/loaddata_single_quotes.dat binary enginetest/testdata/loaddata_nulls.dat binary -enginetest/testdata/loaddata_mixed_escapes.dat binary \ No newline at end of file +enginetest/testdata/loaddata_escape.dat binary diff --git a/enginetest/queries/load_queries.go b/enginetest/queries/load_queries.go index e8ec6ad173..0b3e7ce4f1 100644 --- a/enginetest/queries/load_queries.go +++ b/enginetest/queries/load_queries.go @@ -29,7 +29,7 @@ var LoadDataScripts = []ScriptTest{ Name: "LOAD DATA with ENCLOSED BY and ESCAPED BY parsing", SetUpScript: []string{ "create table t1(pk int primary key, c1 longtext)", - "LOAD DATA INFILE './testdata/loaddata_9969.dat' INTO TABLE t1 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\"'", + "LOAD DATA INFILE './testdata/loaddata_term_in_field.dat' INTO TABLE t1 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\"'", "create table t2(pk int primary key, c1 longtext)", "LOAD DATA INFILE './testdata/loaddata_escape.dat' INTO TABLE t2 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\\\\'", "create table t3(a varchar(20), b varchar(20))", @@ -40,6 +40,14 @@ var LoadDataScripts = []ScriptTest{ "LOAD DATA INFILE './testdata/loaddata_single_quotes.dat' INTO TABLE t5 FIELDS TERMINATED BY ',' ENCLOSED BY ''''", "create table t6(pk int, a varchar(20), b varchar(20))", "LOAD DATA INFILE './testdata/loaddata_nulls.dat' INTO TABLE t6 FIELDS TERMINATED BY ','", + "create table t7(i int, v text)", + "LOAD DATA INFILE './testdata/loaddata_eof.dat' INTO TABLE t7 FIELDS TERMINATED BY ',' ENCLOSED BY '$' ESCAPED BY '$'", + "create table t8(i int, v text)", + "LOAD DATA INFILE './testdata/loaddata_enc_esc_eq.dat' INTO TABLE t8 FIELDS TERMINATED BY ',' ENCLOSED BY '$' ESCAPED BY '$'", + "create table t9(i int, v text)", + "LOAD DATA INFILE './testdata/loaddata_lborder_null.dat' INTO TABLE t9 FIELDS TERMINATED BY ',' ENCLOSED BY '' ESCAPED BY ''", + "create table t10(i int, v text)", + "LOAD DATA INFILE './testdata/loaddata_null_in_field.dat' INTO TABLE t10 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY ''", }, Assertions: []ScriptTestAssertion{ { @@ -67,7 +75,7 @@ var LoadDataScripts = []ScriptTest{ }, }, { - Query: "select * from t5", // order by a breaks + Query: "select * from t5", Expected: []sql.Row{ {"Field A", "Field B"}, {"Field 1", "Field 2"}, @@ -84,10 +92,36 @@ var LoadDataScripts = []ScriptTest{ {4, nil, nil}, }, }, + { + Query: "select * from t7", + Expected: []sql.Row{ + {1, "foo $0 $b $n $t $Z $N bar"}, + {2, "$foo $ bar$"}, + }, + }, + { + Query: "select * from t8", + Expected: []sql.Row{ + {1, "foo $0 $b $n $t $Z $N bar"}, + {2, "foo $ bar"}, + }, + }, + { + Query: "select * from t9", + Expected: []sql.Row{ + {1, "\x00foo bar"}, + }, + }, + { + Query: "select * from t10", + Expected: []sql.Row{ + {1, "foo \x00 bar"}, + }, + }, }, }, { - Name: "LOAD DATA applies column defaults when \\N provided", + Name: "LOAD DATA does not apply column defaults when \\N provided", SetUpScript: []string{ "create table t (pk int primary key, c1 int default 1, c2 int)", // Explicitly use Windows-style line endings to be robust on Windows CI @@ -96,7 +130,7 @@ var LoadDataScripts = []ScriptTest{ Assertions: []ScriptTestAssertion{ { Query: "select * from t", - Expected: []sql.Row{{1, 1, 1}}, + Expected: []sql.Row{{1, nil, 1}}, }, }, }, diff --git a/enginetest/testdata/loaddata_9969.dat b/enginetest/testdata/loaddata_9969.dat deleted file mode 100644 index 1e1eb13033..0000000000 --- a/enginetest/testdata/loaddata_9969.dat +++ /dev/null @@ -1 +0,0 @@ -"1","foo,bar" \ No newline at end of file diff --git a/enginetest/testdata/loaddata_enc_esc_eq.dat b/enginetest/testdata/loaddata_enc_esc_eq.dat new file mode 100644 index 0000000000..a59941afbc --- /dev/null +++ b/enginetest/testdata/loaddata_enc_esc_eq.dat @@ -0,0 +1,2 @@ +$1$,$foo $0 $b $n $t $Z $N bar$ +$2$,$foo $$ bar$ diff --git a/enginetest/testdata/loaddata_enclosed.dat b/enginetest/testdata/loaddata_enclosed.dat index f00e9d5897..686681d718 100644 --- a/enginetest/testdata/loaddata_enclosed.dat +++ b/enginetest/testdata/loaddata_enclosed.dat @@ -1,3 +1,3 @@ "field1","field2" "a""b","cd""ef" -"foo,bar","baz,qux" \ No newline at end of file +"foo,bar","baz,qux" diff --git a/enginetest/testdata/loaddata_eof.dat b/enginetest/testdata/loaddata_eof.dat new file mode 100644 index 0000000000..503c07e7bb --- /dev/null +++ b/enginetest/testdata/loaddata_eof.dat @@ -0,0 +1,2 @@ +$1$,$foo $0 $b $n $t $Z $N bar$ +$2$,$foo $$ bar$ \ No newline at end of file diff --git a/enginetest/testdata/loaddata_escape.dat b/enginetest/testdata/loaddata_escape.dat index 395e5e45d3..dd71618bf0 100644 --- a/enginetest/testdata/loaddata_escape.dat +++ b/enginetest/testdata/loaddata_escape.dat @@ -1 +1 @@ -"1","foo\,bar" \ No newline at end of file +"1","foo\,bar" diff --git a/enginetest/testdata/loaddata_lborder_null.dat b/enginetest/testdata/loaddata_lborder_null.dat new file mode 100644 index 0000000000000000000000000000000000000000..3420ea8da6ed61704254f77d05d16f2cc6a06225 GIT binary patch literal 10 RcmXriVMxo*S4c`M0ssx^0`~v_ literal 0 HcmV?d00001 diff --git a/enginetest/testdata/loaddata_mixed_escapes.dat b/enginetest/testdata/loaddata_mixed_escapes.dat index 5da4c8ba6a..3fe892d76a 100644 --- a/enginetest/testdata/loaddata_mixed_escapes.dat +++ b/enginetest/testdata/loaddata_mixed_escapes.dat @@ -1,3 +1,3 @@ "hello\nworld","foo\tbar" "a\,b","c\,d" -"\N","\Z" \ No newline at end of file +"\N","\Z" diff --git a/enginetest/testdata/loaddata_null_in_field.dat b/enginetest/testdata/loaddata_null_in_field.dat new file mode 100644 index 0000000000000000000000000000000000000000..0157cbb9fd6bfe882d8328f51d2d3fe04d3c092a GIT binary patch literal 15 WcmY!~RMJsO%g= 0 { - return i + len(l.LinesTerminatedBy), data[0:i], nil + // Include the terminator in the token so parser can detect EOF vs terminated lines + return i + len(l.LinesTerminatedBy), data[0 : i+len(l.LinesTerminatedBy)], nil } - // If at end of file with data return the data. + // If at end of file with data return the data (no terminator present = EOF) if atEOF { return len(data), data, nil } diff --git a/sql/rowexec/ddl_iters.go b/sql/rowexec/ddl_iters.go index 16be51d784..3a58743c7c 100644 --- a/sql/rowexec/ddl_iters.go +++ b/sql/rowexec/ddl_iters.go @@ -130,47 +130,66 @@ func (l *loadDataIter) parseLinePrefix(line string) string { } } -func (l *loadDataIter) parseFields(ctx *sql.Context, line string) ([]sql.Expression, error) { - // Start by searching for prefix if there is one +func (l *loadDataIter) parseFields(ctx *sql.Context, line string) (exprs []sql.Expression, err error) { + // Step 1. Start by Searching for prefix if there is one line = l.parseLinePrefix(line) if line == "" { return nil, nil } - // line is parsed character-by-character to respect enclosed fields that may contain the field terminator + // Step 2: Split the lines into fields given the delim, respecting ENCLOSED BY. + // Fields enclosed by the enclosure character can contain the field terminator. + // TODO: Support the OPTIONALLY parameter. + + // Check if line has terminator (if not, it ended at EOF) + hasTerminator := strings.HasSuffix(line, l.linesTerminatedBy) + if hasTerminator { + line = line[:len(line)-len(l.linesTerminatedBy)] + } + var fields []string var currentField strings.Builder - var encChar, escChar byte - if l.fieldsEnclosedBy != "" { - encChar = l.fieldsEnclosedBy[0] - } - if l.fieldsEscapedBy != "" { - escChar = l.fieldsEscapedBy[0] - } - termLen := len(l.fieldsTerminatedBy) inEnclosure := false + termLen := len(l.fieldsTerminatedBy) + hasEnc := l.fieldsEnclosedBy != "" + hasEsc := l.fieldsEscapedBy != "" + encEqualsEsc := hasEnc && hasEsc && l.fieldsEnclosedBy == l.fieldsEscapedBy + // False only at EOF with enc==esc: ambiguous whether final char closes field or is literal data + normalLineTerm := hasTerminator || !encEqualsEsc for i := 0; i < len(line); i++ { ch := line[i] - if ch == encChar { - if inEnclosure { - // consume escaped char when encChar = escChar - if ch == escChar && i+1 < len(line) && line[i+1] == encChar { - currentField.WriteByte(encChar) - i++ - continue - } + isEncChar := hasEnc && ch == l.fieldsEnclosedBy[0] + // When enc==esc, doubling handles escaping (e.g., $$ -> $), not escape sequences + isEscChar := hasEsc && !encEqualsEsc && ch == l.fieldsEscapedBy[0] + + // Start enclosure at beginning of field + if isEncChar && !inEnclosure && currentField.Len() == 0 { + inEnclosure = true + continue + } + + // Special case: escaped enclosure character does not end enclosure and is written literally + if isEncChar && inEnclosure && encEqualsEsc && i+1 < len(line) && line[i+1] == l.fieldsEnclosedBy[0] { + currentField.WriteByte(l.fieldsEnclosedBy[0]) + i++ + continue + } + + // Close enclosure if followed by field terminator or at end of line + if isEncChar && inEnclosure { + followedByTerm := i+1+termLen <= len(line) && line[i+1:i+1+termLen] == l.fieldsTerminatedBy + atLineEnd := i+1 >= len(line) + if followedByTerm || (atLineEnd && normalLineTerm) { inEnclosure = false continue } - if currentField.Len() == 0 { - inEnclosure = true - continue - } + // Enclosure char in middle of field, treat as literal + currentField.WriteByte(ch) + continue } - // we consumed the char above so we don't process when encChar = escChar - if escChar != encChar && ch == escChar && i+1 < len(line) { + if isEscChar && i+1 < len(line) { i++ switch line[i] { case 'N': @@ -193,6 +212,7 @@ func (l *loadDataIter) parseFields(ctx *sql.Context, line string) ([]sql.Express continue } + // Handle field terminator (only outside enclosures) if !inEnclosure && i+termLen <= len(line) && line[i:i+termLen] == l.fieldsTerminatedBy { fields = append(fields, currentField.String()) currentField.Reset() @@ -203,8 +223,14 @@ func (l *loadDataIter) parseFields(ctx *sql.Context, line string) ([]sql.Express currentField.WriteByte(ch) } - fields = append(fields, currentField.String()) - if !l.fieldsEnclosedByOpt && inEnclosure { + lastField := currentField.String() + // If still in enclosure at EOF when enc==esc, prepend the opening enclosure that was stripped + if inEnclosure && !normalLineTerm { + lastField = string(l.fieldsEnclosedBy[0]) + lastField + } + fields = append(fields, lastField) + + if inEnclosure && normalLineTerm { return nil, fmt.Errorf("error: unterminated enclosed field") } @@ -213,7 +239,7 @@ func (l *loadDataIter) parseFields(ctx *sql.Context, line string) ([]sql.Express fieldRow[i] = field } - exprs := make([]sql.Expression, len(l.destSch)) + exprs = make([]sql.Expression, len(l.destSch)) for fieldIdx, exprIdx := 0, 0; fieldIdx < len(fields) && fieldIdx < len(l.userVars); fieldIdx++ { if l.userVars[fieldIdx] != nil { setField := l.userVars[fieldIdx].(*expression.SetField) @@ -246,19 +272,7 @@ func (l *loadDataIter) parseFields(ctx *sql.Context, line string) ([]sql.Express } } case "NULL": - // For MySQL LOAD DATA semantics, \N (mapped to NULL here) should use the column default - // if one exists; otherwise insert NULL. - destIdx := l.fieldToColMap[fieldIdx] - if destIdx >= 0 { - destCol := l.destSch[destIdx] - if destCol.Default != nil { - exprs[exprIdx] = destCol.Default - } else { - exprs[exprIdx] = expression.NewLiteral(nil, types.Null) - } - } else { - exprs[exprIdx] = expression.NewLiteral(nil, types.Null) - } + exprs[exprIdx] = expression.NewLiteral(nil, types.Null) default: exprs[exprIdx] = expression.NewLiteral(field, types.LongText) }