diff --git a/.gitattributes b/.gitattributes index 6168d3d095..33eef5d0d0 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,12 +1,22 @@ -enginetest/testdata/test1.txt binary -enginetest/testdata/test2.csv binary -enginetest/testdata/test3.csv binary -enginetest/testdata/test3backwards.csv binary -enginetest/testdata/test4.txt binary -enginetest/testdata/test5.txt binary -enginetest/testdata/test6.csv binary -enginetest/testdata/test7.txt binary -enginetest/testdata/test8.txt binary -enginetest/testdata/test9.txt binary -enginetest/testdata/test10.txt binary -enginetest/testdata/simple_json.txt binary \ No newline at end of file +enginetest/testdata/test1.txt binary +enginetest/testdata/test2.csv binary +enginetest/testdata/test3.csv binary +enginetest/testdata/test3backwards.csv binary +enginetest/testdata/test4.txt binary +enginetest/testdata/test5.txt binary +enginetest/testdata/test6.csv binary +enginetest/testdata/test7.txt binary +enginetest/testdata/test8.txt binary +enginetest/testdata/test9.txt binary +enginetest/testdata/test10.txt binary +enginetest/testdata/simple_json.txt binary +enginetest/testdata/loaddata_null_in_field.dat binary +enginetest/testdata/loaddata_lborder_null.dat binary +enginetest/testdata/loaddata_enc_esc_eq.dat binary +enginetest/testdata/loaddata_eof.dat binary +enginetest/testdata/loaddata_term_in_field.dat binary +enginetest/testdata/loaddata_mixed_escapes.dat binary +enginetest/testdata/loaddata_enclosed.dat binary +enginetest/testdata/loaddata_single_quotes.dat binary +enginetest/testdata/loaddata_nulls.dat binary +enginetest/testdata/loaddata_escape.dat binary diff --git a/enginetest/queries/load_queries.go b/enginetest/queries/load_queries.go index 7006f25b8c..0b3e7ce4f1 100644 --- a/enginetest/queries/load_queries.go +++ b/enginetest/queries/load_queries.go @@ -25,7 +25,103 @@ import ( var LoadDataScripts = []ScriptTest{ { - Name: "LOAD DATA applies column defaults when \\N provided", + // https://github.com/dolthub/dolt/issues/9969 + Name: "LOAD DATA with ENCLOSED BY and ESCAPED BY parsing", + SetUpScript: []string{ + "create table t1(pk int primary key, c1 longtext)", + "LOAD DATA INFILE './testdata/loaddata_term_in_field.dat' INTO TABLE t1 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\"'", + "create table t2(pk int primary key, c1 longtext)", + "LOAD DATA INFILE './testdata/loaddata_escape.dat' INTO TABLE t2 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\\\\'", + "create table t3(a varchar(20), b varchar(20))", + "LOAD DATA INFILE './testdata/loaddata_enclosed.dat' INTO TABLE t3 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\"'", + "create table t4(a varchar(20), b varchar(20))", + "LOAD DATA INFILE './testdata/loaddata_mixed_escapes.dat' INTO TABLE t4 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\\\\'", + "create table t5(a text, b text)", + "LOAD DATA INFILE './testdata/loaddata_single_quotes.dat' INTO TABLE t5 FIELDS TERMINATED BY ',' ENCLOSED BY ''''", + "create table t6(pk int, a varchar(20), b varchar(20))", + "LOAD DATA INFILE './testdata/loaddata_nulls.dat' INTO TABLE t6 FIELDS TERMINATED BY ','", + "create table t7(i int, v text)", + "LOAD DATA INFILE './testdata/loaddata_eof.dat' INTO TABLE t7 FIELDS TERMINATED BY ',' ENCLOSED BY '$' ESCAPED BY '$'", + "create table t8(i int, v text)", + "LOAD DATA INFILE './testdata/loaddata_enc_esc_eq.dat' INTO TABLE t8 FIELDS TERMINATED BY ',' ENCLOSED BY '$' ESCAPED BY '$'", + "create table t9(i int, v text)", + "LOAD DATA INFILE './testdata/loaddata_lborder_null.dat' INTO TABLE t9 FIELDS TERMINATED BY ',' ENCLOSED BY '' ESCAPED BY ''", + "create table t10(i int, v text)", + "LOAD DATA INFILE './testdata/loaddata_null_in_field.dat' INTO TABLE t10 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY ''", + }, + Assertions: []ScriptTestAssertion{ + { + Query: "select * from t1", + Expected: []sql.Row{{1, "foo,bar"}}, + }, + { + Query: "select * from t2", + Expected: []sql.Row{{1, "foo,bar"}}, + }, + { + Query: "select * from t3 ORDER BY a", + Expected: []sql.Row{ + {"a\"b", "cd\"ef"}, + {"field1", "field2"}, + {"foo,bar", "baz,qux"}, + }, + }, + { + Query: "select * from t4", + Expected: []sql.Row{ + {nil, "\x1A"}, + {"a,b", "c,d"}, + {"hello\nworld", "foo\tbar"}, + }, + }, + { + Query: "select * from t5", + Expected: []sql.Row{ + {"Field A", "Field B"}, + {"Field 1", "Field 2"}, + {"Field 3", "Field 4"}, + {"Field 5", "Field 6"}, + }, + }, + { + Query: "select * from t6 ORDER BY pk", + Expected: []sql.Row{ + {1, "hello", "world"}, + {2, nil, "test"}, + {3, "", "empty"}, + {4, nil, nil}, + }, + }, + { + Query: "select * from t7", + Expected: []sql.Row{ + {1, "foo $0 $b $n $t $Z $N bar"}, + {2, "$foo $ bar$"}, + }, + }, + { + Query: "select * from t8", + Expected: []sql.Row{ + {1, "foo $0 $b $n $t $Z $N bar"}, + {2, "foo $ bar"}, + }, + }, + { + Query: "select * from t9", + Expected: []sql.Row{ + {1, "\x00foo bar"}, + }, + }, + { + Query: "select * from t10", + Expected: []sql.Row{ + {1, "foo \x00 bar"}, + }, + }, + }, + }, + { + Name: "LOAD DATA does not apply column defaults when \\N provided", SetUpScript: []string{ "create table t (pk int primary key, c1 int default 1, c2 int)", // Explicitly use Windows-style line endings to be robust on Windows CI @@ -34,7 +130,7 @@ var LoadDataScripts = []ScriptTest{ Assertions: []ScriptTestAssertion{ { Query: "select * from t", - Expected: []sql.Row{{1, 1, 1}}, + Expected: []sql.Row{{1, nil, 1}}, }, }, }, diff --git a/enginetest/testdata/loaddata_enc_esc_eq.dat b/enginetest/testdata/loaddata_enc_esc_eq.dat new file mode 100644 index 0000000000..a59941afbc --- /dev/null +++ b/enginetest/testdata/loaddata_enc_esc_eq.dat @@ -0,0 +1,2 @@ +$1$,$foo $0 $b $n $t $Z $N bar$ +$2$,$foo $$ bar$ diff --git a/enginetest/testdata/loaddata_enclosed.dat b/enginetest/testdata/loaddata_enclosed.dat new file mode 100644 index 0000000000..686681d718 --- /dev/null +++ b/enginetest/testdata/loaddata_enclosed.dat @@ -0,0 +1,3 @@ +"field1","field2" +"a""b","cd""ef" +"foo,bar","baz,qux" diff --git a/enginetest/testdata/loaddata_eof.dat b/enginetest/testdata/loaddata_eof.dat new file mode 100644 index 0000000000..503c07e7bb --- /dev/null +++ b/enginetest/testdata/loaddata_eof.dat @@ -0,0 +1,2 @@ +$1$,$foo $0 $b $n $t $Z $N bar$ +$2$,$foo $$ bar$ \ No newline at end of file diff --git a/enginetest/testdata/loaddata_escape.dat b/enginetest/testdata/loaddata_escape.dat new file mode 100644 index 0000000000..dd71618bf0 --- /dev/null +++ b/enginetest/testdata/loaddata_escape.dat @@ -0,0 +1 @@ +"1","foo\,bar" diff --git a/enginetest/testdata/loaddata_lborder_null.dat b/enginetest/testdata/loaddata_lborder_null.dat new file mode 100644 index 0000000000..3420ea8da6 Binary files /dev/null and b/enginetest/testdata/loaddata_lborder_null.dat differ diff --git a/enginetest/testdata/loaddata_mixed_escapes.dat b/enginetest/testdata/loaddata_mixed_escapes.dat new file mode 100644 index 0000000000..3fe892d76a --- /dev/null +++ b/enginetest/testdata/loaddata_mixed_escapes.dat @@ -0,0 +1,3 @@ +"hello\nworld","foo\tbar" +"a\,b","c\,d" +"\N","\Z" diff --git a/enginetest/testdata/loaddata_null_in_field.dat b/enginetest/testdata/loaddata_null_in_field.dat new file mode 100644 index 0000000000..0157cbb9fd Binary files /dev/null and b/enginetest/testdata/loaddata_null_in_field.dat differ diff --git a/enginetest/testdata/loaddata_nulls.dat b/enginetest/testdata/loaddata_nulls.dat new file mode 100644 index 0000000000..403abdc49f --- /dev/null +++ b/enginetest/testdata/loaddata_nulls.dat @@ -0,0 +1,4 @@ +1,hello,world +2,\N,test +3,,empty +4,\N,\N diff --git a/enginetest/testdata/loaddata_single_quotes.dat b/enginetest/testdata/loaddata_single_quotes.dat new file mode 100644 index 0000000000..57e010a481 --- /dev/null +++ b/enginetest/testdata/loaddata_single_quotes.dat @@ -0,0 +1,4 @@ +Field A,'Field B' +Field 1,'Field 2' +Field 3,'Field 4' +'Field 5','Field 6' diff --git a/enginetest/testdata/loaddata_term_in_field.dat b/enginetest/testdata/loaddata_term_in_field.dat new file mode 100644 index 0000000000..c8d533f971 --- /dev/null +++ b/enginetest/testdata/loaddata_term_in_field.dat @@ -0,0 +1 @@ +"1","foo,bar" diff --git a/enginetest/testdata/simple_json.txt b/enginetest/testdata/simple_json.txt index e0290f3598..79475cfed0 100644 --- a/enginetest/testdata/simple_json.txt +++ b/enginetest/testdata/simple_json.txt @@ -1 +1 @@ -"1","{""foo"":""bar""}" \ No newline at end of file +"1","{""foo"":""bar""}" diff --git a/sql/plan/load_data.go b/sql/plan/load_data.go index aa7609877e..b4acc61d3d 100644 --- a/sql/plan/load_data.go +++ b/sql/plan/load_data.go @@ -77,10 +77,11 @@ func (l *LoadData) SplitLines(data []byte, atEOF bool) (advance int, token []byt // Find the index of the LINES TERMINATED BY delim. if i := bytes.Index(data, []byte(l.LinesTerminatedBy)); i >= 0 { - return i + len(l.LinesTerminatedBy), data[0:i], nil + // Include the terminator in the token so parser can detect EOF vs terminated lines + return i + len(l.LinesTerminatedBy), data[0 : i+len(l.LinesTerminatedBy)], nil } - // If at end of file with data return the data. + // If at end of file with data return the data (no terminator present = EOF) if atEOF { return len(data), data, nil } diff --git a/sql/rowexec/ddl_iters.go b/sql/rowexec/ddl_iters.go index ecbc16be97..3a58743c7c 100644 --- a/sql/rowexec/ddl_iters.go +++ b/sql/rowexec/ddl_iters.go @@ -130,56 +130,108 @@ func (l *loadDataIter) parseLinePrefix(line string) string { } } -func (l *loadDataIter) parseFields(ctx *sql.Context, line string) ([]sql.Expression, error) { +func (l *loadDataIter) parseFields(ctx *sql.Context, line string) (exprs []sql.Expression, err error) { // Step 1. Start by Searching for prefix if there is one line = l.parseLinePrefix(line) if line == "" { return nil, nil } - // Step 2: Split the lines into fields given the delim - fields := strings.Split(line, l.fieldsTerminatedBy) - - // Step 3: Go through each field and see if it was enclosed by something + // Step 2: Split the lines into fields given the delim, respecting ENCLOSED BY. + // Fields enclosed by the enclosure character can contain the field terminator. // TODO: Support the OPTIONALLY parameter. - if l.fieldsEnclosedBy != "" { - for i, field := range fields { - if field[0] == l.fieldsEnclosedBy[0] && field[len(field)-1] == l.fieldsEnclosedBy[0] { - fields[i] = field[1 : len(field)-1] - } else { - return nil, fmt.Errorf("error: field not properly enclosed") + + // Check if line has terminator (if not, it ended at EOF) + hasTerminator := strings.HasSuffix(line, l.linesTerminatedBy) + if hasTerminator { + line = line[:len(line)-len(l.linesTerminatedBy)] + } + + var fields []string + var currentField strings.Builder + inEnclosure := false + termLen := len(l.fieldsTerminatedBy) + hasEnc := l.fieldsEnclosedBy != "" + hasEsc := l.fieldsEscapedBy != "" + encEqualsEsc := hasEnc && hasEsc && l.fieldsEnclosedBy == l.fieldsEscapedBy + // False only at EOF with enc==esc: ambiguous whether final char closes field or is literal data + normalLineTerm := hasTerminator || !encEqualsEsc + + for i := 0; i < len(line); i++ { + ch := line[i] + isEncChar := hasEnc && ch == l.fieldsEnclosedBy[0] + // When enc==esc, doubling handles escaping (e.g., $$ -> $), not escape sequences + isEscChar := hasEsc && !encEqualsEsc && ch == l.fieldsEscapedBy[0] + + // Start enclosure at beginning of field + if isEncChar && !inEnclosure && currentField.Len() == 0 { + inEnclosure = true + continue + } + + // Special case: escaped enclosure character does not end enclosure and is written literally + if isEncChar && inEnclosure && encEqualsEsc && i+1 < len(line) && line[i+1] == l.fieldsEnclosedBy[0] { + currentField.WriteByte(l.fieldsEnclosedBy[0]) + i++ + continue + } + + // Close enclosure if followed by field terminator or at end of line + if isEncChar && inEnclosure { + followedByTerm := i+1+termLen <= len(line) && line[i+1:i+1+termLen] == l.fieldsTerminatedBy + atLineEnd := i+1 >= len(line) + if followedByTerm || (atLineEnd && normalLineTerm) { + inEnclosure = false + continue } + // Enclosure char in middle of field, treat as literal + currentField.WriteByte(ch) + continue } - } - // Step 4: Handle the ESCAPED BY parameter. - if l.fieldsEscapedBy != "" { - for i, field := range fields { - if field == "\\N" { - fields[i] = "NULL" - } else if field == "\\Z" { - fields[i] = fmt.Sprintf("%c", 26) // ASCII 26 - } else if field == "\\0" { - fields[i] = fmt.Sprintf("%c", 0) // ASCII 0 - } else { - // The character immediately following the escaped character remains untouched, even if it is the same - // as the escape character - newField := make([]byte, 0, len(field)) - for cIdx := 0; cIdx < len(field); cIdx++ { - c := field[cIdx] - // skip over escaped character, but always add the following character - if c == l.fieldsEscapedBy[0] { - cIdx += 1 - if cIdx < len(field) { - newField = append(newField, c) - } - continue - } - newField = append(newField, c) - } - fields[i] = string(newField) + if isEscChar && i+1 < len(line) { + i++ + switch line[i] { + case 'N': + currentField.WriteString("NULL") + case 'Z': + currentField.WriteByte(26) + case '0': + currentField.WriteByte(0) + case 'n': + currentField.WriteByte('\n') + case 't': + currentField.WriteByte('\t') + case 'r': + currentField.WriteByte('\r') + case 'b': + currentField.WriteByte('\b') + default: + currentField.WriteByte(line[i]) } + continue + } + + // Handle field terminator (only outside enclosures) + if !inEnclosure && i+termLen <= len(line) && line[i:i+termLen] == l.fieldsTerminatedBy { + fields = append(fields, currentField.String()) + currentField.Reset() + i += termLen - 1 + continue } + + currentField.WriteByte(ch) + } + + lastField := currentField.String() + // If still in enclosure at EOF when enc==esc, prepend the opening enclosure that was stripped + if inEnclosure && !normalLineTerm { + lastField = string(l.fieldsEnclosedBy[0]) + lastField + } + fields = append(fields, lastField) + + if inEnclosure && normalLineTerm { + return nil, fmt.Errorf("error: unterminated enclosed field") } fieldRow := make(sql.Row, len(fields)) @@ -187,7 +239,7 @@ func (l *loadDataIter) parseFields(ctx *sql.Context, line string) ([]sql.Express fieldRow[i] = field } - exprs := make([]sql.Expression, len(l.destSch)) + exprs = make([]sql.Expression, len(l.destSch)) for fieldIdx, exprIdx := 0, 0; fieldIdx < len(fields) && fieldIdx < len(l.userVars); fieldIdx++ { if l.userVars[fieldIdx] != nil { setField := l.userVars[fieldIdx].(*expression.SetField) @@ -220,19 +272,7 @@ func (l *loadDataIter) parseFields(ctx *sql.Context, line string) ([]sql.Express } } case "NULL": - // For MySQL LOAD DATA semantics, \N (mapped to NULL here) should use the column default - // if one exists; otherwise insert NULL. - destIdx := l.fieldToColMap[fieldIdx] - if destIdx >= 0 { - destCol := l.destSch[destIdx] - if destCol.Default != nil { - exprs[exprIdx] = destCol.Default - } else { - exprs[exprIdx] = expression.NewLiteral(nil, types.Null) - } - } else { - exprs[exprIdx] = expression.NewLiteral(nil, types.Null) - } + exprs[exprIdx] = expression.NewLiteral(nil, types.Null) default: exprs[exprIdx] = expression.NewLiteral(field, types.LongText) }