Skip to content

Commit 7a762aa

Browse files
craig[bot] and Abhinav1299
committed
Merge #149394
149394: pkg/cli: Enhance TSV parsing in zip_upload_table_dumps r=Abhinav1299 a=Abhinav1299 This commit introduces a new function, makeQuotedTSVIterator, to handle TSV files with quoted fields containing newlines. This is used to correctly parse complex fields such as SQL statements that spans multiple lines. It also updates the existing makeTableIterator to correctly manage empty files. The processTableDump function is modified to utilize the new parser for specific files, improving robustness in handling complex TSV formats. Part of: CRDB-51738 Epic: CRDB-52093 Release note: None Co-authored-by: Abhinav1299 <[email protected]>
2 parents b0ad230 + 2ec19cc commit 7a762aa

File tree

2 files changed

+209
-21
lines changed

2 files changed

+209
-21
lines changed

pkg/cli/zip_upload_table_dumps.go

Lines changed: 152 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ import (
2727
"github.com/cockroachdb/errors"
2828
)
2929

30+
const (
31+
createStatementsFileName = "crdb_internal.create_statements.txt"
32+
)
33+
3034
type tsvColumnParserFn func(string) (any, error)
3135

3236
type columnParserMap map[string]tsvColumnParserFn
@@ -55,7 +59,7 @@ var clusterWideTableDumps = map[string]columnParserMap{
5559
"system.rangelog.txt": {},
5660
"crdb_internal.table_indexes.txt": {},
5761
"crdb_internal.index_usage_statistics.txt": {},
58-
"crdb_internal.create_statements.txt": {},
62+
createStatementsFileName: {},
5963
"system.job_info.txt": {},
6064
"crdb_internal.create_schema_statements.txt": {},
6165
"crdb_internal.default_privileges.txt": {},
@@ -253,11 +257,13 @@ func processTableDump(
253257
tags = append(tags, makeDDTag(nodeIDTag, strings.Split(fileName, "/")[1]))
254258
}
255259

256-
header, iter := makeTableIterator(f)
257-
if err := iter(func(row string) error {
258-
cols := strings.Split(row, "\t")
260+
// Common processing function for all the parsers
261+
processFields := func(header []string, cols []string) error {
259262
if len(header) != len(cols) {
260-
return errors.Newf("the number of headers is not matching the number of columns in the row")
263+
return errors.Newf(
264+
"the number of headers (%d) is not matching the number of columns (%d) in the row",
265+
len(header), len(cols),
266+
)
261267
}
262268

263269
headerColumnMapping := map[string]any{
@@ -302,8 +308,23 @@ func processTableDump(
302308

303309
lines = append(lines, jsonRow)
304310
return nil
305-
}); err != nil {
306-
return err
311+
}
312+
313+
var processErr error
314+
315+
iterMaker := makeTableIterator // default parser
316+
if strings.HasSuffix(fileName, createStatementsFileName) {
317+
// Using makeQuotedTSVIterator parser for create_statements.txt because it has
318+
// SQL CREATE statements with embedded newlines and tabs
319+
iterMaker = makeQuotedTSVIterator
320+
}
321+
header, iter := iterMaker(f)
322+
processErr = iter(func(cols []string) error {
323+
return processFields(header, cols)
324+
})
325+
326+
if processErr != nil {
327+
return processErr
307328
}
308329

309330
// flush the remaining lines if any
@@ -316,28 +337,37 @@ func processTableDump(
316337
return nil
317338
}
318339

319-
// makeTableIterator returns the headers slice and an iterator
320-
func makeTableIterator(f io.Reader) ([]string, func(func(string) error) error) {
340+
// makeTableIterator returns the headers slice and an iterator (original implementation for most files)
341+
func makeTableIterator(f io.Reader) ([]string, func(func([]string) error) error) {
321342
reader := bufio.NewReader(f)
322343

323344
// Read first line for headers
324345
headerLine, err := reader.ReadString('\n')
325346
if err != nil && err != io.EOF {
326-
return nil, func(func(string) error) error { return err }
347+
return nil, func(func([]string) error) error { return err }
327348
}
328349

329350
// Trim the newline character if present
330351
headerLine = strings.TrimSuffix(headerLine, "\n")
331-
headers := strings.Split(headerLine, "\t")
332352

333-
return headers, func(fn func(string) error) error {
353+
// Handle empty files correctly
354+
var headers []string
355+
if headerLine == "" {
356+
headers = []string{}
357+
} else {
358+
headers = strings.Split(headerLine, "\t")
359+
}
360+
361+
return headers, func(fn func([]string) error) error {
334362
for {
335363
line, err := reader.ReadString('\n')
336364
if err != nil {
337365
if err == io.EOF {
338366
// Process any remaining content before EOF
339367
if line != "" {
340-
if err := fn(strings.TrimSuffix(line, "\n")); err != nil {
368+
line = strings.TrimSuffix(line, "\n")
369+
cols := strings.Split(line, "\t")
370+
if err := fn(cols); err != nil {
341371
return err
342372
}
343373
}
@@ -346,16 +376,123 @@ func makeTableIterator(f io.Reader) ([]string, func(func(string) error) error) {
346376
return err
347377
}
348378

349-
// Trim the newline character
379+
// Trim the newline character and split into fields
350380
line = strings.TrimSuffix(line, "\n")
351-
if err := fn(line); err != nil {
381+
cols := strings.Split(line, "\t")
382+
if err := fn(cols); err != nil {
383+
return err
384+
}
385+
}
386+
return nil
387+
}
388+
}
389+
390+
// makeQuotedTSVIterator returns the headers slice and an iterator that properly handles
391+
// TSV files with quoted fields containing newlines. This function is ONLY used for
392+
// special cases where TSV fields contain complex content that spans multiple lines,
393+
// such as SQL CREATE statements with embedded newlines and tabs.
394+
//
395+
// When to use:
396+
// - Use this function when TSV fields are quoted and may contain newlines (e.g., crdb_internal.create_statements.txt)
397+
// - DO NOT use for regular TSV files - use makeTableIterator instead
398+
//
399+
// Example problem this solves:
400+
//
401+
// A TSV field containing: "CREATE TABLE foo (\n id INT,\n name STRING\n)"
402+
// Simple line-by-line parsing would incorrectly split this into multiple records,
403+
// but this function correctly treats it as a single field value.
404+
//
405+
// Unlike makeTableIterator, this function uses readTSVRecord to correctly parse
406+
// complex fields and returns parsed field slices for each record.
407+
func makeQuotedTSVIterator(f io.Reader) ([]string, func(func([]string) error) error) {
408+
reader := bufio.NewReader(f)
409+
410+
_, headers, err := readTSVRecord(reader)
411+
if err != nil {
412+
if err == io.EOF {
413+
return []string{}, func(func([]string) error) error { return nil }
414+
}
415+
return nil, func(func([]string) error) error { return err }
416+
}
417+
418+
if len(headers) == 0 {
419+
return headers, func(func([]string) error) error { return nil }
420+
}
421+
422+
return headers, func(fn func([]string) error) error {
423+
for {
424+
_, fields, err := readTSVRecord(reader)
425+
if err != nil {
426+
if err == io.EOF {
427+
break
428+
}
429+
return err
430+
}
431+
432+
if len(fields) == 0 {
433+
break
434+
}
435+
436+
if err := fn(fields); err != nil {
352437
return err
353438
}
354439
}
355440
return nil
356441
}
357442
}
358443

444+
// readTSVRecord reads a complete TSV record from a buffered reader, properly handling
445+
// quoted fields that may contain embedded newlines and tab characters. Unlike simple
446+
// line-by-line parsing, this function tracks quote state to correctly parse fields
447+
// that span multiple lines. Returns the complete raw line, parsed field values as a
448+
// slice, and any error encountered during reading.
449+
func readTSVRecord(reader *bufio.Reader) (string, []string, error) {
450+
var line strings.Builder
451+
var fields []string
452+
var currentField strings.Builder
453+
inQuotes := false
454+
hasContent := false
455+
456+
for {
457+
b, err := reader.ReadByte()
458+
if err != nil {
459+
if err == io.EOF {
460+
if hasContent && (line.Len() > 0 || currentField.Len() > 0) {
461+
fields = append(fields, currentField.String())
462+
return line.String(), fields, nil
463+
}
464+
return "", fields, io.EOF
465+
}
466+
return "", nil, err
467+
}
468+
469+
hasContent = true
470+
line.WriteByte(b)
471+
472+
switch b {
473+
case '"':
474+
inQuotes = !inQuotes
475+
currentField.WriteByte(b)
476+
case '\t':
477+
if inQuotes {
478+
currentField.WriteByte(b)
479+
} else {
480+
fields = append(fields, currentField.String())
481+
currentField.Reset()
482+
}
483+
case '\n':
484+
if inQuotes {
485+
currentField.WriteByte(b)
486+
} else {
487+
fields = append(fields, currentField.String())
488+
return line.String(), fields, nil
489+
}
490+
default:
491+
currentField.WriteByte(b)
492+
}
493+
}
494+
}
495+
359496
func getNodeSpecificTableDumps(debugDirPath string) ([]string, error) {
360497
allTxtFiles, err := expandPatterns([]string{path.Join(debugDirPath, zippedNodeTableDumpsPattern)})
361498
if err != nil {

pkg/cli/zip_upload_table_dumps_test.go

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -100,14 +100,65 @@ func TestMakeTableIterator(t *testing.T) {
100100
)),
101101
headers: []string{"h1"},
102102
},
103+
{
104+
name: "empty file",
105+
input: bytes.NewBufferString(""),
106+
headers: []string{},
107+
},
108+
{
109+
name: "header only",
110+
input: bytes.NewBufferString("h1\th2\th3\n"),
111+
headers: []string{"h1", "h2", "h3"},
112+
},
103113
}
104114

105115
for _, tc := range tt {
106-
headers, iter := makeTableIterator(tc.input)
107-
assert.Equal(t, tc.headers, headers)
108-
require.NoError(t, iter(func(row string) error {
109-
assert.Len(t, strings.Split(row, "\t"), len(headers))
110-
return nil
111-
}))
116+
t.Run("Old_"+tc.name, func(t *testing.T) {
117+
headers, iter := makeTableIterator(tc.input)
118+
assert.Equal(t, tc.headers, headers)
119+
require.NoError(t, iter(func(fields []string) error {
120+
assert.Len(t, fields, len(headers))
121+
return nil
122+
}))
123+
})
124+
}
125+
126+
// Test cases specifically for the quoted TSV parser with quoted fields
127+
quotedTSVTestCases := []struct {
128+
name string
129+
input io.Reader
130+
headers []string
131+
}{
132+
{
133+
name: "simple",
134+
input: bytes.NewBufferString("h1\th2\th3\nr1c1\tr1c2\tr1c3\nr2c1\tr2c2\tr2c3\n"),
135+
headers: []string{"h1", "h2", "h3"},
136+
},
137+
{
138+
name: "quoted fields with newlines",
139+
input: bytes.NewBufferString("col1\tcol2\tcol3\nval1\t\"CREATE TABLE test (\n\tid INT,\n\tname STRING\n)\"\tval3\n"),
140+
headers: []string{"col1", "col2", "col3"},
141+
},
142+
{
143+
name: "empty file",
144+
input: bytes.NewBufferString(""),
145+
headers: []string{},
146+
},
147+
{
148+
name: "header only",
149+
input: bytes.NewBufferString("h1\th2\th3\n"),
150+
headers: []string{"h1", "h2", "h3"},
151+
},
152+
}
153+
154+
for _, tc := range quotedTSVTestCases {
155+
t.Run("QuotedTSV_"+tc.name, func(t *testing.T) {
156+
headers, iter := makeQuotedTSVIterator(tc.input)
157+
assert.Equal(t, tc.headers, headers)
158+
require.NoError(t, iter(func(fields []string) error {
159+
assert.Len(t, fields, len(headers))
160+
return nil
161+
}))
162+
})
112163
}
113164
}

0 commit comments

Comments (0)