@@ -27,6 +27,10 @@ import (
 	"github.com/cockroachdb/errors"
 )
 
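+// createStatementsFileName is shared between the table-dump registry below
+// and the parser selection in processTableDump.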
+const (
+	createStatementsFileName = "crdb_internal.create_statements.txt"
+)
+
 type tsvColumnParserFn func(string) (any, error)
 
 type columnParserMap map[string]tsvColumnParserFn
@@ -55,7 +59,7 @@ var clusterWideTableDumps = map[string]columnParserMap{
 	"system.rangelog.txt":                        {},
 	"crdb_internal.table_indexes.txt":            {},
 	"crdb_internal.index_usage_statistics.txt":   {},
-	"crdb_internal.create_statements.txt":        {},
+	createStatementsFileName:                     {},
 	"system.job_info.txt":                        {},
 	"crdb_internal.create_schema_statements.txt": {},
 	"crdb_internal.default_privileges.txt":       {},
@@ -253,11 +257,13 @@ func processTableDump(
 		tags = append(tags, makeDDTag(nodeIDTag, strings.Split(fileName, "/")[1]))
 	}
 
-	header, iter := makeTableIterator(f)
-	if err := iter(func(row string) error {
-		cols := strings.Split(row, "\t")
+	// processFields holds the row-processing logic common to both parsers.
+	processFields := func(header []string, cols []string) error {
 		if len(header) != len(cols) {
-			return errors.Newf("the number of headers is not matching the number of columns in the row")
+			return errors.Newf(
+				"the number of headers (%d) does not match the number of columns (%d) in the row",
+				len(header), len(cols),
+			)
 		}
 
 		headerColumnMapping := map[string]any{
@@ -302,8 +308,23 @@ func processTableDump(
 
 		lines = append(lines, jsonRow)
 		return nil
-	}); err != nil {
-		return err
+	}
+
+	var processErr error
+
+	iterMaker := makeTableIterator // default parser
+	if strings.HasSuffix(fileName, createStatementsFileName) {
+		// Use the makeQuotedTSVIterator parser for create_statements.txt because
+		// its rows contain SQL CREATE statements with embedded newlines and tabs.
+		iterMaker = makeQuotedTSVIterator
+	}
+	header, iter := iterMaker(f)
+	processErr = iter(func(cols []string) error {
+		return processFields(header, cols)
+	})
+
+	if processErr != nil {
+		return processErr
 	}
 
 	// flush the remaining lines if any
@@ -316,28 +337,37 @@ func processTableDump(
 	return nil
 }
 
-// makeTableIterator returns the headers slice and an iterator
-func makeTableIterator(f io.Reader) ([]string, func(func(string) error) error) {
+// makeTableIterator returns the headers slice and an iterator (the default line-based parser used for most files)
+func makeTableIterator(f io.Reader) ([]string, func(func([]string) error) error) {
 	reader := bufio.NewReader(f)
 
 	// Read first line for headers
 	headerLine, err := reader.ReadString('\n')
 	if err != nil && err != io.EOF {
-		return nil, func(func(string) error) error { return err }
+		return nil, func(func([]string) error) error { return err }
 	}
 
 	// Trim the newline character if present
 	headerLine = strings.TrimSuffix(headerLine, "\n")
-	headers := strings.Split(headerLine, "\t")
 
-	return headers, func(fn func(string) error) error {
+	// Handle empty files correctly
+	var headers []string
+	if headerLine == "" {
+		headers = []string{}
+	} else {
+		headers = strings.Split(headerLine, "\t")
+	}
+
+	return headers, func(fn func([]string) error) error {
 		for {
 			line, err := reader.ReadString('\n')
 			if err != nil {
 				if err == io.EOF {
 					// Process any remaining content before EOF
 					if line != "" {
-						if err := fn(strings.TrimSuffix(line, "\n")); err != nil {
+						line = strings.TrimSuffix(line, "\n")
+						cols := strings.Split(line, "\t")
+						if err := fn(cols); err != nil {
 							return err
 						}
 					}
@@ -346,16 +376,123 @@ func makeTableIterator(f io.Reader) ([]string, func(func(string) error) error) {
 				return err
 			}
 
-			// Trim the newline character
+			// Trim the newline character and split into fields
 			line = strings.TrimSuffix(line, "\n")
-			if err := fn(line); err != nil {
+			cols := strings.Split(line, "\t")
+			if err := fn(cols); err != nil {
+				return err
+			}
+		}
+		return nil
+	}
+}
+
+// makeQuotedTSVIterator returns the headers slice and an iterator that properly handles
+// TSV files with quoted fields containing newlines. This function is ONLY used for
+// special cases where TSV fields contain complex content that spans multiple lines,
+// such as SQL CREATE statements with embedded newlines and tabs.
+//
+// When to use:
+//   - Use this function when TSV fields are quoted and may contain newlines (e.g., crdb_internal.create_statements.txt)
+//   - DO NOT use for regular TSV files - use makeTableIterator instead
+//
+// Example problem this solves:
+//
+//	A TSV field containing: "CREATE TABLE foo (\n  id INT,\n  name STRING\n)"
+//	Simple line-by-line parsing would incorrectly split this into multiple records,
+//	but this function correctly treats it as a single field value.
+//
+// Unlike makeTableIterator, this function uses readTSVRecord to correctly parse
+// complex fields and returns parsed field slices for each record.
+func makeQuotedTSVIterator(f io.Reader) ([]string, func(func([]string) error) error) {
+	reader := bufio.NewReader(f)
+
+	_, headers, err := readTSVRecord(reader)
+	if err != nil {
+		if err == io.EOF {
+			return []string{}, func(func([]string) error) error { return nil }
+		}
+		return nil, func(func([]string) error) error { return err }
+	}
+
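+	// An empty header row means the file has no data to iterate over.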
+	if len(headers) == 0 {
+		return headers, func(func([]string) error) error { return nil }
+	}
+
+	return headers, func(fn func([]string) error) error {
+		for {
+			_, fields, err := readTSVRecord(reader)
+			if err != nil {
+				if err == io.EOF {
+					break
+				}
+				return err
+			}
+
+			if len(fields) == 0 {
+				break
+			}
+
+			if err := fn(fields); err != nil {
 				return err
 			}
 		}
 		return nil
 	}
 }
 
+// readTSVRecord reads a complete TSV record from a buffered reader, properly handling
+// quoted fields that may contain embedded newlines and tab characters. Unlike simple
+// line-by-line parsing, this function tracks quote state to correctly parse fields
+// that span multiple lines. It returns the complete raw line, the parsed field values
+// as a slice, and any error encountered during reading.
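+//
+// For example, the input record
+//
+//	"CREATE TABLE foo (\n\tid INT\n)"\t42\n
+//
+// is returned as a single record with two fields; the newline and tab bytes
+// inside the quoted first field are kept as part of that field's value.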
+func readTSVRecord(reader *bufio.Reader) (string, []string, error) {
+	var line strings.Builder
+	var fields []string
+	var currentField strings.Builder
+	inQuotes := false
+	hasContent := false
+
+	for {
+		b, err := reader.ReadByte()
+		if err != nil {
+			if err == io.EOF {
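+				// Flush a final record that has content but no trailing newline.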
+				if hasContent && (line.Len() > 0 || currentField.Len() > 0) {
+					fields = append(fields, currentField.String())
+					return line.String(), fields, nil
+				}
+				return "", fields, io.EOF
+			}
+			return "", nil, err
+		}
+
+		hasContent = true
+		line.WriteByte(b)
+
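+		// A double quote toggles quote state; tab and newline bytes read while
+		// inQuotes is true are field content rather than delimiters.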
+		switch b {
+		case '"':
+			inQuotes = !inQuotes
+			currentField.WriteByte(b)
+		case '\t':
+			if inQuotes {
+				currentField.WriteByte(b)
+			} else {
+				fields = append(fields, currentField.String())
+				currentField.Reset()
+			}
+		case '\n':
+			if inQuotes {
+				currentField.WriteByte(b)
+			} else {
+				fields = append(fields, currentField.String())
+				return line.String(), fields, nil
+			}
+		default:
+			currentField.WriteByte(b)
+		}
+	}
+}
+
 func getNodeSpecificTableDumps(debugDirPath string) ([]string, error) {
 	allTxtFiles, err := expandPatterns([]string{path.Join(debugDirPath, zippedNodeTableDumpsPattern)})
 	if err != nil {