Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 22 additions & 12 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
enginetest/testdata/test1.txt binary
enginetest/testdata/test2.csv binary
enginetest/testdata/test3.csv binary
enginetest/testdata/test3backwards.csv binary
enginetest/testdata/test4.txt binary
enginetest/testdata/test5.txt binary
enginetest/testdata/test6.csv binary
enginetest/testdata/test7.txt binary
enginetest/testdata/test8.txt binary
enginetest/testdata/test9.txt binary
enginetest/testdata/test10.txt binary
enginetest/testdata/simple_json.txt binary
enginetest/testdata/test1.txt binary
enginetest/testdata/test2.csv binary
enginetest/testdata/test3.csv binary
enginetest/testdata/test3backwards.csv binary
enginetest/testdata/test4.txt binary
enginetest/testdata/test5.txt binary
enginetest/testdata/test6.csv binary
enginetest/testdata/test7.txt binary
enginetest/testdata/test8.txt binary
enginetest/testdata/test9.txt binary
enginetest/testdata/test10.txt binary
enginetest/testdata/simple_json.txt binary
enginetest/testdata/loaddata_null_in_field.dat binary
enginetest/testdata/loaddata_lborder_null.dat binary
enginetest/testdata/loaddata_enc_esc_eq.dat binary
enginetest/testdata/loaddata_eof.dat binary
enginetest/testdata/loaddata_term_in_field.dat binary
enginetest/testdata/loaddata_mixed_escapes.dat binary
enginetest/testdata/loaddata_enclosed.dat binary
enginetest/testdata/loaddata_single_quotes.dat binary
enginetest/testdata/loaddata_nulls.dat binary
enginetest/testdata/loaddata_escape.dat binary
100 changes: 98 additions & 2 deletions enginetest/queries/load_queries.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,103 @@ import (

var LoadDataScripts = []ScriptTest{
{
Name: "LOAD DATA applies column defaults when \\N provided",
// https://github.com/dolthub/dolt/issues/9969
Name: "LOAD DATA with ENCLOSED BY and ESCAPED BY parsing",
SetUpScript: []string{
"create table t1(pk int primary key, c1 longtext)",
"LOAD DATA INFILE './testdata/loaddata_term_in_field.dat' INTO TABLE t1 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\"'",
"create table t2(pk int primary key, c1 longtext)",
"LOAD DATA INFILE './testdata/loaddata_escape.dat' INTO TABLE t2 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\\\\'",
"create table t3(a varchar(20), b varchar(20))",
"LOAD DATA INFILE './testdata/loaddata_enclosed.dat' INTO TABLE t3 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\"'",
"create table t4(a varchar(20), b varchar(20))",
"LOAD DATA INFILE './testdata/loaddata_mixed_escapes.dat' INTO TABLE t4 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\\\\'",
"create table t5(a text, b text)",
"LOAD DATA INFILE './testdata/loaddata_single_quotes.dat' INTO TABLE t5 FIELDS TERMINATED BY ',' ENCLOSED BY ''''",
"create table t6(pk int, a varchar(20), b varchar(20))",
"LOAD DATA INFILE './testdata/loaddata_nulls.dat' INTO TABLE t6 FIELDS TERMINATED BY ','",
"create table t7(i int, v text)",
"LOAD DATA INFILE './testdata/loaddata_eof.dat' INTO TABLE t7 FIELDS TERMINATED BY ',' ENCLOSED BY '$' ESCAPED BY '$'",
"create table t8(i int, v text)",
"LOAD DATA INFILE './testdata/loaddata_enc_esc_eq.dat' INTO TABLE t8 FIELDS TERMINATED BY ',' ENCLOSED BY '$' ESCAPED BY '$'",
"create table t9(i int, v text)",
"LOAD DATA INFILE './testdata/loaddata_lborder_null.dat' INTO TABLE t9 FIELDS TERMINATED BY ',' ENCLOSED BY '' ESCAPED BY ''",
"create table t10(i int, v text)",
"LOAD DATA INFILE './testdata/loaddata_null_in_field.dat' INTO TABLE t10 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY ''",
},
Assertions: []ScriptTestAssertion{
{
Query: "select * from t1",
Expected: []sql.Row{{1, "foo,bar"}},
},
{
Query: "select * from t2",
Expected: []sql.Row{{1, "foo,bar"}},
},
{
Query: "select * from t3 ORDER BY a",
Expected: []sql.Row{
{"a\"b", "cd\"ef"},
{"field1", "field2"},
{"foo,bar", "baz,qux"},
},
},
{
Query: "select * from t4",
Expected: []sql.Row{
{nil, "\x1A"},
{"a,b", "c,d"},
{"hello\nworld", "foo\tbar"},
},
},
{
Query: "select * from t5",
Expected: []sql.Row{
{"Field A", "Field B"},
{"Field 1", "Field 2"},
{"Field 3", "Field 4"},
{"Field 5", "Field 6"},
},
},
{
Query: "select * from t6 ORDER BY pk",
Expected: []sql.Row{
{1, "hello", "world"},
{2, nil, "test"},
{3, "", "empty"},
{4, nil, nil},
},
},
{
Query: "select * from t7",
Expected: []sql.Row{
{1, "foo $0 $b $n $t $Z $N bar"},
{2, "$foo $ bar$"},
},
},
{
Query: "select * from t8",
Expected: []sql.Row{
{1, "foo $0 $b $n $t $Z $N bar"},
{2, "foo $ bar"},
},
},
{
Query: "select * from t9",
Expected: []sql.Row{
{1, "\x00foo bar"},
},
},
{
Query: "select * from t10",
Expected: []sql.Row{
{1, "foo \x00 bar"},
},
},
},
},
{
Name: "LOAD DATA does not apply column defaults when \\N provided",
SetUpScript: []string{
"create table t (pk int primary key, c1 int default 1, c2 int)",
// Explicitly use Windows-style line endings to be robust on Windows CI
Expand All @@ -34,7 +130,7 @@ var LoadDataScripts = []ScriptTest{
Assertions: []ScriptTestAssertion{
{
Query: "select * from t",
Expected: []sql.Row{{1, 1, 1}},
Expected: []sql.Row{{1, nil, 1}},
},
},
},
Expand Down
2 changes: 2 additions & 0 deletions enginetest/testdata/loaddata_enc_esc_eq.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
$1$,$foo $0 $b $n $t $Z $N bar$
$2$,$foo $$ bar$
3 changes: 3 additions & 0 deletions enginetest/testdata/loaddata_enclosed.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"field1","field2"
"a""b","cd""ef"
"foo,bar","baz,qux"
2 changes: 2 additions & 0 deletions enginetest/testdata/loaddata_eof.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
$1$,$foo $0 $b $n $t $Z $N bar$
$2$,$foo $$ bar$
1 change: 1 addition & 0 deletions enginetest/testdata/loaddata_escape.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"1","foo\,bar"
Binary file added enginetest/testdata/loaddata_lborder_null.dat
Binary file not shown.
3 changes: 3 additions & 0 deletions enginetest/testdata/loaddata_mixed_escapes.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"hello\nworld","foo\tbar"
"a\,b","c\,d"
"\N","\Z"
Binary file added enginetest/testdata/loaddata_null_in_field.dat
Binary file not shown.
4 changes: 4 additions & 0 deletions enginetest/testdata/loaddata_nulls.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
1,hello,world
2,\N,test
3,,empty
4,\N,\N
4 changes: 4 additions & 0 deletions enginetest/testdata/loaddata_single_quotes.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Field A,'Field B'
Field 1,'Field 2'
Field 3,'Field 4'
'Field 5','Field 6'
1 change: 1 addition & 0 deletions enginetest/testdata/loaddata_term_in_field.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"1","foo,bar"
2 changes: 1 addition & 1 deletion enginetest/testdata/simple_json.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
"1","{""foo"":""bar""}"
"1","{""foo"":""bar""}"
5 changes: 3 additions & 2 deletions sql/plan/load_data.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,11 @@ func (l *LoadData) SplitLines(data []byte, atEOF bool) (advance int, token []byt

// Find the index of the LINES TERMINATED BY delim.
if i := bytes.Index(data, []byte(l.LinesTerminatedBy)); i >= 0 {
return i + len(l.LinesTerminatedBy), data[0:i], nil
// Include the terminator in the token so the parser can tell a terminated line from one that ended at EOF.
return i + len(l.LinesTerminatedBy), data[0 : i+len(l.LinesTerminatedBy)], nil
}

// If at end of file with data return the data.
// If at end of file with data, return the data as the final token (no terminator present means EOF).
if atEOF {
return len(data), data, nil
}
Expand Down
144 changes: 92 additions & 52 deletions sql/rowexec/ddl_iters.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,64 +130,116 @@ func (l *loadDataIter) parseLinePrefix(line string) string {
}
}

func (l *loadDataIter) parseFields(ctx *sql.Context, line string) ([]sql.Expression, error) {
func (l *loadDataIter) parseFields(ctx *sql.Context, line string) (exprs []sql.Expression, err error) {
// Step 1. Start by Searching for prefix if there is one
line = l.parseLinePrefix(line)
if line == "" {
return nil, nil
}

// Step 2: Split the lines into fields given the delim
fields := strings.Split(line, l.fieldsTerminatedBy)

// Step 3: Go through each field and see if it was enclosed by something
// Step 2: Split the lines into fields given the delim, respecting ENCLOSED BY.
// Fields enclosed by the enclosure character can contain the field terminator.
// TODO: Support the OPTIONALLY parameter.
if l.fieldsEnclosedBy != "" {
for i, field := range fields {
if field[0] == l.fieldsEnclosedBy[0] && field[len(field)-1] == l.fieldsEnclosedBy[0] {
fields[i] = field[1 : len(field)-1]
} else {
return nil, fmt.Errorf("error: field not properly enclosed")

// Check if line has terminator (if not, it ended at EOF)
hasTerminator := strings.HasSuffix(line, l.linesTerminatedBy)
if hasTerminator {
line = line[:len(line)-len(l.linesTerminatedBy)]
}

var fields []string
var currentField strings.Builder
inEnclosure := false
termLen := len(l.fieldsTerminatedBy)
hasEnc := l.fieldsEnclosedBy != ""
hasEsc := l.fieldsEscapedBy != ""
encEqualsEsc := hasEnc && hasEsc && l.fieldsEnclosedBy == l.fieldsEscapedBy
// False only at EOF with enc==esc: ambiguous whether final char closes field or is literal data
normalLineTerm := hasTerminator || !encEqualsEsc

for i := 0; i < len(line); i++ {
ch := line[i]
isEncChar := hasEnc && ch == l.fieldsEnclosedBy[0]
// When enc==esc, doubling handles escaping (e.g., $$ -> $), not escape sequences
isEscChar := hasEsc && !encEqualsEsc && ch == l.fieldsEscapedBy[0]

// Start enclosure at beginning of field
if isEncChar && !inEnclosure && currentField.Len() == 0 {
inEnclosure = true
continue
}

// Special case: escaped enclosure character does not end enclosure and is written literally
if isEncChar && inEnclosure && encEqualsEsc && i+1 < len(line) && line[i+1] == l.fieldsEnclosedBy[0] {
currentField.WriteByte(l.fieldsEnclosedBy[0])
i++
continue
}

// Close enclosure if followed by field terminator or at end of line
if isEncChar && inEnclosure {
followedByTerm := i+1+termLen <= len(line) && line[i+1:i+1+termLen] == l.fieldsTerminatedBy
atLineEnd := i+1 >= len(line)
if followedByTerm || (atLineEnd && normalLineTerm) {
inEnclosure = false
continue
}
// Enclosure char in middle of field, treat as literal
currentField.WriteByte(ch)
continue
}
}

// Step 4: Handle the ESCAPED BY parameter.
if l.fieldsEscapedBy != "" {
for i, field := range fields {
if field == "\\N" {
fields[i] = "NULL"
} else if field == "\\Z" {
fields[i] = fmt.Sprintf("%c", 26) // ASCII 26
} else if field == "\\0" {
fields[i] = fmt.Sprintf("%c", 0) // ASCII 0
} else {
// The character immediately following the escaped character remains untouched, even if it is the same
// as the escape character
newField := make([]byte, 0, len(field))
for cIdx := 0; cIdx < len(field); cIdx++ {
c := field[cIdx]
// skip over escaped character, but always add the following character
if c == l.fieldsEscapedBy[0] {
cIdx += 1
if cIdx < len(field) {
newField = append(newField, c)
}
continue
}
newField = append(newField, c)
}
fields[i] = string(newField)
if isEscChar && i+1 < len(line) {
i++
switch line[i] {
case 'N':
currentField.WriteString("NULL")
case 'Z':
currentField.WriteByte(26)
case '0':
currentField.WriteByte(0)
case 'n':
currentField.WriteByte('\n')
case 't':
currentField.WriteByte('\t')
case 'r':
currentField.WriteByte('\r')
case 'b':
currentField.WriteByte('\b')
default:
currentField.WriteByte(line[i])
}
continue
}

// Handle field terminator (only outside enclosures)
if !inEnclosure && i+termLen <= len(line) && line[i:i+termLen] == l.fieldsTerminatedBy {
fields = append(fields, currentField.String())
currentField.Reset()
i += termLen - 1
continue
}

currentField.WriteByte(ch)
}

lastField := currentField.String()
// If still in enclosure at EOF when enc==esc, prepend the opening enclosure that was stripped
if inEnclosure && !normalLineTerm {
lastField = string(l.fieldsEnclosedBy[0]) + lastField
}
fields = append(fields, lastField)

if inEnclosure && normalLineTerm {
return nil, fmt.Errorf("error: unterminated enclosed field")
}

fieldRow := make(sql.Row, len(fields))
for i, field := range fields {
fieldRow[i] = field
}

exprs := make([]sql.Expression, len(l.destSch))
exprs = make([]sql.Expression, len(l.destSch))
for fieldIdx, exprIdx := 0, 0; fieldIdx < len(fields) && fieldIdx < len(l.userVars); fieldIdx++ {
if l.userVars[fieldIdx] != nil {
setField := l.userVars[fieldIdx].(*expression.SetField)
Expand Down Expand Up @@ -220,19 +272,7 @@ func (l *loadDataIter) parseFields(ctx *sql.Context, line string) ([]sql.Express
}
}
case "NULL":
// For MySQL LOAD DATA semantics, \N (mapped to NULL here) should use the column default
// if one exists; otherwise insert NULL.
destIdx := l.fieldToColMap[fieldIdx]
if destIdx >= 0 {
destCol := l.destSch[destIdx]
if destCol.Default != nil {
exprs[exprIdx] = destCol.Default
} else {
exprs[exprIdx] = expression.NewLiteral(nil, types.Null)
}
} else {
exprs[exprIdx] = expression.NewLiteral(nil, types.Null)
}
exprs[exprIdx] = expression.NewLiteral(nil, types.Null)
default:
exprs[exprIdx] = expression.NewLiteral(field, types.LongText)
}
Expand Down
Loading