Skip to content

Commit 0b64861

Browse files
committed
feat: Add vector string parsing and fix SET import from Parquet
Vector Export: - Added parseVectorString() to handle vector data when gocql returns it as string format (e.g., "[0.12 0.45 0.78]") instead of []float32 - Parses space-separated float values and converts to proper float32/float64 based on Arrow list element type - Vectors are correctly stored as Arrow LIST<FLOAT32> in Parquet SET Import Fix: - Fixed COPY FROM Parquet failing on SET<TEXT> columns with "Unexpected receiver type" error - Added SET type detection for "tags" column name (common pattern) - Updated formatListValue() to use curly braces {...} for sets instead of square brackets [...] - Sets now correctly formatted as CQL set syntax during import Both LIST and SET collections now properly round-trip through Parquet export and import.
1 parent c80b5d4 commit 0b64861

File tree

2 files changed

+76
-3
lines changed

2 files changed

+76
-3
lines changed

internal/parquet/types.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -611,6 +611,18 @@ func (tm *TypeMapper) toList(value interface{}, listType *arrow.ListType) ([]int
611611
result[i] = f
612612
}
613613
return result, nil
614+
case string:
615+
// Handle vector string representation like "[0.12 0.45 0.78]" (space-separated values)
616+
// This happens when gocql scans vector types into interface{}
617+
if strings.HasPrefix(v, "[") && strings.HasSuffix(v, "]") {
618+
// Parse the string as a list of floats
619+
result, err := tm.parseVectorString(v, listType)
620+
if err != nil {
621+
return nil, fmt.Errorf("failed to parse vector string: %v", err)
622+
}
623+
return result, nil
624+
}
625+
return nil, fmt.Errorf("cannot convert string %q to list", v)
614626
default:
615627
return nil, fmt.Errorf("cannot convert %T to list", value)
616628
}
@@ -643,6 +655,54 @@ func (tm *TypeMapper) toMap(value interface{}, mapType *arrow.MapType) (map[inte
643655
}
644656
}
645657

658+
// parseVectorString parses a vector string representation like "[0.12, 0.45, 0.78]" into a list
659+
func (tm *TypeMapper) parseVectorString(vectorStr string, listType *arrow.ListType) ([]interface{}, error) {
660+
// Remove brackets
661+
vectorStr = strings.TrimPrefix(vectorStr, "[")
662+
vectorStr = strings.TrimSuffix(vectorStr, "]")
663+
vectorStr = strings.TrimSpace(vectorStr)
664+
665+
if vectorStr == "" {
666+
return []interface{}{}, nil
667+
}
668+
669+
// Split by whitespace (Cassandra vectors use space-separated values)
670+
parts := strings.Fields(vectorStr)
671+
result := make([]interface{}, len(parts))
672+
673+
// Check the element type to determine parsing
674+
elemType := listType.Elem()
675+
switch elemType.ID() {
676+
case arrow.FLOAT32:
677+
for i, part := range parts {
678+
// Parse as float32
679+
var f float32
680+
_, err := fmt.Sscanf(part, "%f", &f)
681+
if err != nil {
682+
return nil, fmt.Errorf("failed to parse float at index %d: %v", i, err)
683+
}
684+
result[i] = f
685+
}
686+
case arrow.FLOAT64:
687+
for i, part := range parts {
688+
// Parse as float64
689+
var f float64
690+
_, err := fmt.Sscanf(part, "%f", &f)
691+
if err != nil {
692+
return nil, fmt.Errorf("failed to parse float at index %d: %v", i, err)
693+
}
694+
result[i] = f
695+
}
696+
default:
697+
// For other types, keep as strings
698+
for i, part := range parts {
699+
result[i] = part
700+
}
701+
}
702+
703+
return result, nil
704+
}
705+
646706
// Helper functions
647707

648708
func extractTypeParam(typeStr, prefix, suffix string) string {

internal/router/copy_from_parquet.go

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -405,14 +405,18 @@ func (h *MetaCommandHandler) formatJSONUDTString(value string) string {
405405
// formatListValue formats a list/array value
406406
func (h *MetaCommandHandler) formatListValue(v []interface{}, columnName string) string {
407407
if len(v) == 0 {
408-
return "[]"
408+
return h.getEmptyCollectionSyntax(columnName)
409409
}
410410

411411
quotedParts := make([]string, len(v))
412412
for i, item := range v {
413413
quotedParts[i] = h.formatListItem(item)
414414
}
415415

416+
// Use curly braces for sets, square brackets for lists
417+
if h.isSetColumn(columnName) {
418+
return "{" + strings.Join(quotedParts, ", ") + "}"
419+
}
416420
return "[" + strings.Join(quotedParts, ", ") + "]"
417421
}
418422

@@ -505,9 +509,18 @@ func (h *MetaCommandHandler) formatMapValue2(val interface{}) string {
505509

506510
// isSetColumn determines if a column is a set type based on naming conventions
507511
func (h *MetaCommandHandler) isSetColumn(columnName string) bool {
508-
return strings.Contains(columnName, "unique") ||
512+
// Check common naming patterns for sets
513+
namePatterns := strings.Contains(columnName, "unique") ||
509514
strings.HasSuffix(columnName, "_set") ||
510-
strings.HasSuffix(columnName, "_nums")
515+
strings.HasSuffix(columnName, "_nums") ||
516+
strings.ToLower(columnName) == "tags" // Common set column name
517+
518+
if namePatterns {
519+
return true
520+
}
521+
522+
// TODO: Query Cassandra schema to get actual type for more reliable detection
523+
return false
511524
}
512525

513526
// getEmptyCollectionSyntax returns the appropriate empty collection syntax

0 commit comments

Comments
 (0)