Skip to content

Commit 2ec19cc

Browse files
committed
pkg/cli: Enhance TSV parsing in zip_upload_table_dumps
This commit introduces a new function, makeQuotedTSVIterator, to parse TSV files whose quoted fields contain newlines — for example, create_statements.txt, which contains SQL CREATE statements with embedded newlines and tabs. It also updates the existing makeTableIterator to correctly handle empty files. Part of: CRDB-51738 Epic: CRDB-52093 Release note: None
1 parent 6cd7729 commit 2ec19cc

File tree

2 files changed

+209
-21
lines changed

2 files changed

+209
-21
lines changed

pkg/cli/zip_upload_table_dumps.go

Lines changed: 152 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ import (
2727
"github.com/cockroachdb/errors"
2828
)
2929

30+
const (
	// createStatementsFileName is the table-dump file whose rows contain SQL
	// CREATE statements with embedded newlines and tabs, so it is routed to
	// the quote-aware TSV parser instead of the line-based one.
	createStatementsFileName = "crdb_internal.create_statements.txt"
)
33+
3034
type tsvColumnParserFn func(string) (any, error)
3135

3236
type columnParserMap map[string]tsvColumnParserFn
@@ -55,7 +59,7 @@ var clusterWideTableDumps = map[string]columnParserMap{
5559
"system.rangelog.txt": {},
5660
"crdb_internal.table_indexes.txt": {},
5761
"crdb_internal.index_usage_statistics.txt": {},
58-
"crdb_internal.create_statements.txt": {},
62+
createStatementsFileName: {},
5963
"system.job_info.txt": {},
6064
"crdb_internal.create_schema_statements.txt": {},
6165
"crdb_internal.default_privileges.txt": {},
@@ -253,11 +257,13 @@ func processTableDump(
253257
tags = append(tags, makeDDTag(nodeIDTag, strings.Split(fileName, "/")[1]))
254258
}
255259

256-
header, iter := makeTableIterator(f)
257-
if err := iter(func(row string) error {
258-
cols := strings.Split(row, "\t")
260+
// Common processing function for all the parsers
261+
processFields := func(header []string, cols []string) error {
259262
if len(header) != len(cols) {
260-
return errors.Newf("the number of headers is not matching the number of columns in the row")
263+
return errors.Newf(
264+
"the number of headers (%d) is not matching the number of columns (%d) in the row",
265+
len(header), len(cols),
266+
)
261267
}
262268

263269
headerColumnMapping := map[string]any{
@@ -302,8 +308,23 @@ func processTableDump(
302308

303309
lines = append(lines, jsonRow)
304310
return nil
305-
}); err != nil {
306-
return err
311+
}
312+
313+
var processErr error
314+
315+
iterMaker := makeTableIterator // default parser
316+
if strings.HasSuffix(fileName, createStatementsFileName) {
317+
// Using makeQuotedTSVIterator parser for create_statements.txt because it has
318+
// SQL CREATE statements with embedded newlines and tabs
319+
iterMaker = makeQuotedTSVIterator
320+
}
321+
header, iter := iterMaker(f)
322+
processErr = iter(func(cols []string) error {
323+
return processFields(header, cols)
324+
})
325+
326+
if processErr != nil {
327+
return processErr
307328
}
308329

309330
// flush the remaining lines if any
@@ -316,28 +337,37 @@ func processTableDump(
316337
return nil
317338
}
318339

319-
// makeTableIterator returns the headers slice and an iterator
320-
func makeTableIterator(f io.Reader) ([]string, func(func(string) error) error) {
340+
// makeTableIterator returns the headers slice and an iterator (original implementation for most files)
341+
func makeTableIterator(f io.Reader) ([]string, func(func([]string) error) error) {
321342
reader := bufio.NewReader(f)
322343

323344
// Read first line for headers
324345
headerLine, err := reader.ReadString('\n')
325346
if err != nil && err != io.EOF {
326-
return nil, func(func(string) error) error { return err }
347+
return nil, func(func([]string) error) error { return err }
327348
}
328349

329350
// Trim the newline character if present
330351
headerLine = strings.TrimSuffix(headerLine, "\n")
331-
headers := strings.Split(headerLine, "\t")
332352

333-
return headers, func(fn func(string) error) error {
353+
// Handle empty files correctly
354+
var headers []string
355+
if headerLine == "" {
356+
headers = []string{}
357+
} else {
358+
headers = strings.Split(headerLine, "\t")
359+
}
360+
361+
return headers, func(fn func([]string) error) error {
334362
for {
335363
line, err := reader.ReadString('\n')
336364
if err != nil {
337365
if err == io.EOF {
338366
// Process any remaining content before EOF
339367
if line != "" {
340-
if err := fn(strings.TrimSuffix(line, "\n")); err != nil {
368+
line = strings.TrimSuffix(line, "\n")
369+
cols := strings.Split(line, "\t")
370+
if err := fn(cols); err != nil {
341371
return err
342372
}
343373
}
@@ -346,16 +376,123 @@ func makeTableIterator(f io.Reader) ([]string, func(func(string) error) error) {
346376
return err
347377
}
348378

349-
// Trim the newline character
379+
// Trim the newline character and split into fields
350380
line = strings.TrimSuffix(line, "\n")
351-
if err := fn(line); err != nil {
381+
cols := strings.Split(line, "\t")
382+
if err := fn(cols); err != nil {
383+
return err
384+
}
385+
}
386+
return nil
387+
}
388+
}
389+
390+
// makeQuotedTSVIterator returns the headers slice and an iterator that properly handles
391+
// TSV files with quoted fields containing newlines. This function is ONLY used for
392+
// special cases where TSV fields contain complex content that spans multiple lines,
393+
// such as SQL CREATE statements with embedded newlines and tabs.
394+
//
395+
// When to use:
396+
// - Use this function when TSV fields are quoted and may contain newlines (e.g., crdb_internal.create_statements.txt)
397+
// - DO NOT use for regular TSV files - use makeTableIterator instead
398+
//
399+
// Example problem this solves:
400+
//
401+
// A TSV field containing: "CREATE TABLE foo (\n id INT,\n name STRING\n)"
402+
// Simple line-by-line parsing would incorrectly split this into multiple records,
403+
// but this function correctly treats it as a single field value.
404+
//
405+
// Unlike makeTableIterator, this function uses readTSVRecord to correctly parse
406+
// complex fields and returns parsed field slices for each record.
407+
func makeQuotedTSVIterator(f io.Reader) ([]string, func(func([]string) error) error) {
408+
reader := bufio.NewReader(f)
409+
410+
_, headers, err := readTSVRecord(reader)
411+
if err != nil {
412+
if err == io.EOF {
413+
return []string{}, func(func([]string) error) error { return nil }
414+
}
415+
return nil, func(func([]string) error) error { return err }
416+
}
417+
418+
if len(headers) == 0 {
419+
return headers, func(func([]string) error) error { return nil }
420+
}
421+
422+
return headers, func(fn func([]string) error) error {
423+
for {
424+
_, fields, err := readTSVRecord(reader)
425+
if err != nil {
426+
if err == io.EOF {
427+
break
428+
}
429+
return err
430+
}
431+
432+
if len(fields) == 0 {
433+
break
434+
}
435+
436+
if err := fn(fields); err != nil {
352437
return err
353438
}
354439
}
355440
return nil
356441
}
357442
}
358443

444+
// readTSVRecord reads a complete TSV record from a buffered reader, properly handling
445+
// quoted fields that may contain embedded newlines and tab characters. Unlike simple
446+
// line-by-line parsing, this function tracks quote state to correctly parse fields
447+
// that span multiple lines. Returns the complete raw line, parsed field values as a
448+
// slice, and any error encountered during reading.
449+
func readTSVRecord(reader *bufio.Reader) (string, []string, error) {
450+
var line strings.Builder
451+
var fields []string
452+
var currentField strings.Builder
453+
inQuotes := false
454+
hasContent := false
455+
456+
for {
457+
b, err := reader.ReadByte()
458+
if err != nil {
459+
if err == io.EOF {
460+
if hasContent && (line.Len() > 0 || currentField.Len() > 0) {
461+
fields = append(fields, currentField.String())
462+
return line.String(), fields, nil
463+
}
464+
return "", fields, io.EOF
465+
}
466+
return "", nil, err
467+
}
468+
469+
hasContent = true
470+
line.WriteByte(b)
471+
472+
switch b {
473+
case '"':
474+
inQuotes = !inQuotes
475+
currentField.WriteByte(b)
476+
case '\t':
477+
if inQuotes {
478+
currentField.WriteByte(b)
479+
} else {
480+
fields = append(fields, currentField.String())
481+
currentField.Reset()
482+
}
483+
case '\n':
484+
if inQuotes {
485+
currentField.WriteByte(b)
486+
} else {
487+
fields = append(fields, currentField.String())
488+
return line.String(), fields, nil
489+
}
490+
default:
491+
currentField.WriteByte(b)
492+
}
493+
}
494+
}
495+
359496
func getNodeSpecificTableDumps(debugDirPath string) ([]string, error) {
360497
allTxtFiles, err := expandPatterns([]string{path.Join(debugDirPath, zippedNodeTableDumpsPattern)})
361498
if err != nil {

pkg/cli/zip_upload_table_dumps_test.go

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -100,14 +100,65 @@ func TestMakeTableIterator(t *testing.T) {
100100
)),
101101
headers: []string{"h1"},
102102
},
103+
{
104+
name: "empty file",
105+
input: bytes.NewBufferString(""),
106+
headers: []string{},
107+
},
108+
{
109+
name: "header only",
110+
input: bytes.NewBufferString("h1\th2\th3\n"),
111+
headers: []string{"h1", "h2", "h3"},
112+
},
103113
}
104114

105115
for _, tc := range tt {
106-
headers, iter := makeTableIterator(tc.input)
107-
assert.Equal(t, tc.headers, headers)
108-
require.NoError(t, iter(func(row string) error {
109-
assert.Len(t, strings.Split(row, "\t"), len(headers))
110-
return nil
111-
}))
116+
t.Run("Old_"+tc.name, func(t *testing.T) {
117+
headers, iter := makeTableIterator(tc.input)
118+
assert.Equal(t, tc.headers, headers)
119+
require.NoError(t, iter(func(fields []string) error {
120+
assert.Len(t, fields, len(headers))
121+
return nil
122+
}))
123+
})
124+
}
125+
126+
// Test cases specifically for the quoted TSV parser with quoted fields
127+
quotedTSVTestCases := []struct {
128+
name string
129+
input io.Reader
130+
headers []string
131+
}{
132+
{
133+
name: "simple",
134+
input: bytes.NewBufferString("h1\th2\th3\nr1c1\tr1c2\tr1c3\nr2c1\tr2c2\tr2c3\n"),
135+
headers: []string{"h1", "h2", "h3"},
136+
},
137+
{
138+
name: "quoted fields with newlines",
139+
input: bytes.NewBufferString("col1\tcol2\tcol3\nval1\t\"CREATE TABLE test (\n\tid INT,\n\tname STRING\n)\"\tval3\n"),
140+
headers: []string{"col1", "col2", "col3"},
141+
},
142+
{
143+
name: "empty file",
144+
input: bytes.NewBufferString(""),
145+
headers: []string{},
146+
},
147+
{
148+
name: "header only",
149+
input: bytes.NewBufferString("h1\th2\th3\n"),
150+
headers: []string{"h1", "h2", "h3"},
151+
},
152+
}
153+
154+
for _, tc := range quotedTSVTestCases {
155+
t.Run("QuotedTSV_"+tc.name, func(t *testing.T) {
156+
headers, iter := makeQuotedTSVIterator(tc.input)
157+
assert.Equal(t, tc.headers, headers)
158+
require.NoError(t, iter(func(fields []string) error {
159+
assert.Len(t, fields, len(headers))
160+
return nil
161+
}))
162+
})
112163
}
113164
}

0 commit comments

Comments
 (0)