Skip to content

Commit 0b3ea4b

Browse files
committed
Implement character-by-character parsing for quoted fields
1 parent 1b6b2ba commit 0b3ea4b

File tree

4 files changed

+167
-52
lines changed

4 files changed

+167
-52
lines changed

enginetest/memory_engine_test.go

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -200,28 +200,22 @@ func TestSingleQueryPrepared(t *testing.T) {
200200

201201
// Convenience test for debugging a single query. Unskip and set to the desired query.
202202
func TestSingleScript(t *testing.T) {
203-
//t.Skip()
203+
t.Skip()
204204
var scripts = []queries.ScriptTest{
205205
{
206-
Name: "Dolt diff query returns correct tables (regression 1.59.18)",
207-
SetUpScript: []string{
208-
"SET GLOBAL local_infile=1;",
209-
`CREATE TABLE my_table (
210-
id int NOT NULL,
211-
txt varchar(16) NOT NULL,
212-
PRIMARY KEY (id)
213-
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;`,
214-
`LOAD DATA INFILE 'C:/Users/Elian/dolt_workspace/db/9969/data'
215-
INTO TABLE my_table
216-
FIELDS TERMINATED BY ',' ENCLOSED BY '"' ESCAPED BY '"'
217-
LINES TERMINATED BY '\n'
218-
(id, txt);`,
219-
},
206+
Name: "AS OF propagates to nested CALLs",
207+
SetUpScript: []string{},
220208
Assertions: []queries.ScriptTestAssertion{
221209
{
222-
Query: `SELECT * FROM my_table LIMIT 1`,
210+
Query: "create procedure create_proc() create table t (i int primary key, j int);",
211+
Expected: []sql.Row{
212+
{types.NewOkResult(0)},
213+
},
214+
},
215+
{
216+
Query: "call create_proc()",
223217
Expected: []sql.Row{
224-
{1, "foo,bar"},
218+
{types.NewOkResult(0)},
225219
},
226220
},
227221
},

enginetest/queries/load_queries.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,68 @@ import (
2424
)
2525

2626
var LoadDataScripts = []ScriptTest{
27+
{
28+
// https://github.com/dolthub/dolt/issues/9969
29+
Name: "LOAD DATA with ENCLOSED BY and ESCAPED BY parsing",
30+
SetUpScript: []string{
31+
"create table t1(pk int primary key, c1 longtext)",
32+
"LOAD DATA INFILE './testdata/loaddata_9969.dat' INTO TABLE t1 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\"'",
33+
"create table t2(pk int primary key, c1 longtext)",
34+
"LOAD DATA INFILE './testdata/loaddata_escape.dat' INTO TABLE t2 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\\\\'",
35+
"create table t3(a varchar(20), b varchar(20))",
36+
"LOAD DATA INFILE './testdata/loaddata_enclosed.dat' INTO TABLE t3 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\"'",
37+
"create table t4(a varchar(20), b varchar(20))",
38+
"LOAD DATA INFILE './testdata/loaddata_mixed_escapes.dat' INTO TABLE t4 FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\\\\'",
39+
"create table t5(a text, b text)",
40+
"LOAD DATA INFILE './testdata/loaddata_single_quotes.dat' INTO TABLE t5 FIELDS TERMINATED BY ',' ENCLOSED BY ''''",
41+
"create table t6(pk int, a varchar(20), b varchar(20))",
42+
"LOAD DATA INFILE './testdata/loaddata_nulls.dat' INTO TABLE t6 FIELDS TERMINATED BY ','",
43+
},
44+
Assertions: []ScriptTestAssertion{
45+
{
46+
Query: "select * from t1",
47+
Expected: []sql.Row{{1, "foo,bar"}},
48+
},
49+
{
50+
Query: "select * from t2",
51+
Expected: []sql.Row{{1, "foo,bar"}},
52+
},
53+
{
54+
Query: "select * from t3 ORDER BY a",
55+
Expected: []sql.Row{
56+
{"a\"b", "cd\"ef"},
57+
{"field1", "field2"},
58+
{"foo,bar", "baz,qux"},
59+
},
60+
},
61+
{
62+
Query: "select * from t4",
63+
Expected: []sql.Row{
64+
{nil, "\x1A"},
65+
{"a,b", "c,d"},
66+
{"hello\nworld", "foo\tbar"},
67+
},
68+
},
69+
{
70+
Query: "select * from t5", // order by a breaks
71+
Expected: []sql.Row{
72+
{"Field A", "Field B"},
73+
{"Field 1", "Field 2"},
74+
{"Field 3", "Field 4"},
75+
{"Field 5", "Field 6"},
76+
},
77+
},
78+
{
79+
Query: "select * from t6 ORDER BY pk",
80+
Expected: []sql.Row{
81+
{1, "hello", "world"},
82+
{2, nil, "test"},
83+
{3, "", "empty"},
84+
{4, nil, nil},
85+
},
86+
},
87+
},
88+
},
2789
{
2890
Name: "LOAD DATA applies column defaults when \\N provided",
2991
SetUpScript: []string{
@@ -128,6 +190,7 @@ var LoadDataScripts = []ScriptTest{
128190
},
129191
},
130192
},
193+
// https://github.com/dolthub/dolt/issues/9969
131194
{
132195
Name: "Load JSON data. EnclosedBy and EscapedBy are the same.",
133196
SetUpScript: []string{
File renamed without changes.

sql/rowexec/ddl_iters.go

Lines changed: 93 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -131,54 +131,112 @@ func (l *loadDataIter) parseLinePrefix(line string) string {
131131
}
132132

133133
func (l *loadDataIter) parseFields(ctx *sql.Context, line string) ([]sql.Expression, error) {
134-
// Step 1. Start by Searching for prefix if there is one
134+
// Start by searching for prefix if there is one
135135
line = l.parseLinePrefix(line)
136136
if line == "" {
137137
return nil, nil
138138
}
139139

140-
// Step 2: Split the lines into fields given the delim
141-
fields := strings.Split(line, l.fieldsTerminatedBy)
140+
// Split the line into fields. When ENCLOSED BY is specified, fields must be parsed
141+
// character-by-character to respect quoted fields that may contain the field terminator.
142+
var fields []string
143+
if l.fieldsEnclosedBy == "" {
144+
fields = strings.Split(line, l.fieldsTerminatedBy)
145+
} else {
146+
var currentField strings.Builder
147+
inEnclosure := false
148+
encChar := l.fieldsEnclosedBy[0]
149+
escChar := byte(0)
150+
if l.fieldsEscapedBy != "" {
151+
escChar = l.fieldsEscapedBy[0]
152+
}
153+
termLen := len(l.fieldsTerminatedBy)
154+
155+
for i := 0; i < len(line); i++ {
156+
c := line[i]
157+
158+
// Handle enclosure character
159+
if c == encChar {
160+
if inEnclosure {
161+
// Check for doubled enclosure (escape mechanism when encChar == escChar)
162+
if i+1 < len(line) && line[i+1] == encChar {
163+
currentField.WriteByte(encChar)
164+
i++
165+
continue
166+
}
167+
inEnclosure = false
168+
continue
169+
}
170+
if currentField.Len() == 0 {
171+
inEnclosure = true
172+
continue
173+
}
174+
}
142175

143-
// Step 3: Go through each field and see if it was enclosed by something
144-
// TODO: Support the OPTIONALLY parameter.
145-
if l.fieldsEnclosedBy != "" {
146-
for i, field := range fields {
147-
if field[0] == l.fieldsEnclosedBy[0] && field[len(field)-1] == l.fieldsEnclosedBy[0] {
148-
fields[i] = field[1 : len(field)-1]
149-
} else {
150-
return nil, fmt.Errorf("error: field not properly enclosed")
176+
// Handle escape character (only when different from enclosure character)
177+
if escChar != 0 && escChar != encChar && c == escChar && i+1 < len(line) {
178+
currentField.WriteByte(c)
179+
i++
180+
currentField.WriteByte(line[i])
181+
continue
182+
}
183+
184+
// Handle field terminator (only outside enclosures)
185+
if !inEnclosure && i+termLen <= len(line) && line[i:i+termLen] == l.fieldsTerminatedBy {
186+
fields = append(fields, currentField.String())
187+
currentField.Reset()
188+
i += termLen - 1
189+
continue
151190
}
191+
192+
currentField.WriteByte(c)
193+
}
194+
195+
fields = append(fields, currentField.String())
196+
if !l.fieldsEnclosedByOpt && inEnclosure {
197+
return nil, fmt.Errorf("error: unterminated enclosed field")
152198
}
153199
}
154200

155-
// Step 4: Handle the ESCAPED BY parameter.
156-
if l.fieldsEscapedBy != "" {
201+
// Handle ESCAPED BY parameter for special sequences like \N, \Z, \0, \n, \t, etc.
202+
// When ESCAPED BY equals ENCLOSED BY, escaping was already handled via doubling.
203+
if l.fieldsEscapedBy != "" && l.fieldsEscapedBy != l.fieldsEnclosedBy {
204+
escByte := l.fieldsEscapedBy[0]
157205
for i, field := range fields {
158-
if field == "\\N" {
159-
fields[i] = "NULL"
160-
} else if field == "\\Z" {
161-
fields[i] = fmt.Sprintf("%c", 26) // ASCII 26
162-
} else if field == "\\0" {
163-
fields[i] = fmt.Sprintf("%c", 0) // ASCII 0
164-
} else {
165-
// The character immediately following the escaped character remains untouched, even if it is the same
166-
// as the escape character
167-
newField := make([]byte, 0, len(field))
168-
for cIdx := 0; cIdx < len(field); cIdx++ {
169-
c := field[cIdx]
170-
// skip over escaped character, but always add the following character
171-
if c == l.fieldsEscapedBy[0] {
172-
cIdx += 1
173-
if cIdx < len(field) {
174-
newField = append(newField, c)
175-
}
176-
continue
177-
}
178-
newField = append(newField, c)
206+
if !strings.ContainsRune(field, rune(escByte)) {
207+
continue
208+
}
209+
210+
newField := make([]byte, 0, len(field))
211+
for j := 0; j < len(field); j++ {
212+
if field[j] != escByte || j+1 >= len(field) {
213+
newField = append(newField, field[j])
214+
continue
215+
}
216+
217+
j++
218+
switch field[j] {
219+
case 'N':
220+
fields[i] = "NULL"
221+
goto nextField
222+
case 'Z':
223+
newField = append(newField, 26)
224+
case '0':
225+
newField = append(newField, 0)
226+
case 'n':
227+
newField = append(newField, '\n')
228+
case 't':
229+
newField = append(newField, '\t')
230+
case 'r':
231+
newField = append(newField, '\r')
232+
case 'b':
233+
newField = append(newField, '\b')
234+
default:
235+
newField = append(newField, field[j])
179236
}
180-
fields[i] = string(newField)
181237
}
238+
fields[i] = string(newField)
239+
nextField:
182240
}
183241
}
184242

0 commit comments

Comments
 (0)