Skip to content

Commit 3f18fc6

Browse files
committed
🐛 Fix Parquet quasi-binary import issue #52
Before, when a column had the BYTE_ARRAY type, Anyquery would still try to interpret it as UTF-8, producing the Unicode replacement character (U+FFFD) for invalid byte sequences. To fix this, Anyquery now makes sure a []byte slice is returned rather than a string. I couldn't find any other way to fix this, as parquet-go/parquet-go returns a string even for a Parquet BYTE_ARRAY.
1 parent 177d5e2 commit 3f18fc6

File tree

3 files changed

+62
-10
lines changed

3 files changed

+62
-10
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ require (
4242
github.com/mark3labs/mcp-go v0.31.0
4343
github.com/mattn/go-sqlite3 v1.14.28
4444
github.com/olekukonko/tablewriter v0.0.5
45-
github.com/parquet-go/parquet-go v0.25.0
45+
github.com/parquet-go/parquet-go v0.25.1
4646
github.com/runreveal/pql v0.2.0
4747
github.com/samber/lo v1.50.0
4848
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1231,8 +1231,8 @@ github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+
12311231
github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc=
12321232
github.com/outcaste-io/ristretto v0.2.3 h1:AK4zt/fJ76kjlYObOeNwh4T3asEuaCmp26pOvUOL9w0=
12331233
github.com/outcaste-io/ristretto v0.2.3/go.mod h1:W8HywhmtlopSB1jeMg3JtdIhf+DYkLAr0VN/s4+MHac=
1234-
github.com/parquet-go/parquet-go v0.25.0 h1:GwKy11MuF+al/lV6nUsFw8w8HCiPOSAx1/y8yFxjH5c=
1235-
github.com/parquet-go/parquet-go v0.25.0/go.mod h1:OqBBRGBl7+llplCvDMql8dEKaDqjaFA/VAPw+OJiNiw=
1234+
github.com/parquet-go/parquet-go v0.25.1 h1:l7jJwNM0xrk0cnIIptWMtnSnuxRkwq53S+Po3KG8Xgo=
1235+
github.com/parquet-go/parquet-go v0.25.1/go.mod h1:AXBuotO1XiBtcqJb/FKFyjBG4aqa3aQAAWF3ZPzCanY=
12361236
github.com/paulmach/orb v0.11.1 h1:3koVegMC4X/WeiXYz9iswopaTwMem53NzTJuTF20JzU=
12371237
github.com/paulmach/orb v0.11.1/go.mod h1:5mULz1xQfs3bmQm63QEJA6lNGujuRafwA5S/EnuLaLU=
12381238
github.com/paulmach/protoscan v0.2.1/go.mod h1:SpcSwydNLrxUGSDvXvO0P7g7AuhJ7lcKfDlhJCDw2gY=

module/read_parquet.go

Lines changed: 59 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,11 @@ type ParquetModule struct {
1818

1919
type ParquetTable struct {
2020
mmap mmap.MMap
21-
column map[int]string
21+
column map[int]parquetColumn
2222
}
2323

2424
type ParquetCursor struct {
25-
column map[int]string
25+
column map[int]parquetColumn
2626
reader *parquet.GenericReader[any]
2727
rowBuffer *deque.Deque[map[string]interface{}]
2828
rowID int64
@@ -34,6 +34,12 @@ type ParquetCursor struct {
3434
noMoreRows bool
3535
}
3636

37+
type parquetColumn struct {
38+
Name string
39+
Type string
40+
SubFields map[string]parquetColumn
41+
}
42+
3743
const rowToRequestPerBatch = 16
3844

3945
func (m *ParquetModule) Create(c *sqlite3.SQLiteConn, args []string) (sqlite3.VTab, error) {
@@ -69,7 +75,7 @@ func (m *ParquetModule) Connect(c *sqlite3.SQLiteConn, args []string) (sqlite3.V
6975
}
7076

7177
// Open the file
72-
mmap := mmap.MMap{}
78+
var mmap mmap.MMap
7379
var err error
7480

7581
mmap, err = openMmapedFile(fileName)
@@ -82,7 +88,7 @@ func (m *ParquetModule) Connect(c *sqlite3.SQLiteConn, args []string) (sqlite3.V
8288
// Read the parquet file
8389
reader := parquet.NewGenericReader[any](byteReader)
8490

85-
column := make(map[int]string)
91+
column := make(map[int]parquetColumn)
8692

8793
sqlSchema := strings.Builder{}
8894
sqlSchema.WriteString("CREATE TABLE parquet (")
@@ -106,9 +112,28 @@ func (m *ParquetModule) Connect(c *sqlite3.SQLiteConn, args []string) (sqlite3.V
106112
default:
107113
sqlSchema.WriteString("TEXT")
108114
}
115+
109116
// Save the column name
110-
column[i] = field.Name()
117+
col := parquetColumn{
118+
Name: field.Name(),
119+
Type: field.Type().String(),
120+
}
121+
122+
// Get subfields if the field is a group
123+
if field.Type().String() == "group" {
124+
col.SubFields = make(map[string]parquetColumn)
125+
for _, subField := range field.Fields() {
126+
col.SubFields[subField.Name()] = parquetColumn{
127+
Name: subField.Name(),
128+
Type: subField.Type().String(),
129+
}
130+
}
131+
}
132+
133+
// Save the column in the map
134+
column[i] = col
111135
}
136+
112137
sqlSchema.WriteString(");")
113138
c.DeclareVTab(sqlSchema.String())
114139

@@ -118,6 +143,7 @@ func (m *ParquetModule) Connect(c *sqlite3.SQLiteConn, args []string) (sqlite3.V
118143
func (t *ParquetTable) Open() (sqlite3.VTabCursor, error) {
119144
// Create a new reader
120145
reader := parquet.NewGenericReader[any](bytes.NewReader(t.mmap))
146+
121147
return &ParquetCursor{
122148
column: t.column,
123149
reader: reader,
@@ -195,7 +221,7 @@ func (t *ParquetCursor) Column(context *sqlite3.SQLiteContext, col int) error {
195221
context.ResultNull()
196222
return nil
197223
}
198-
val, ok := t.rowBuffer.Front()[colName]
224+
val, ok := t.rowBuffer.Front()[colName.Name]
199225
if !ok {
200226
context.ResultNull()
201227
return nil
@@ -225,10 +251,36 @@ func (t *ParquetCursor) Column(context *sqlite3.SQLiteContext, col int) error {
225251
case float64:
226252
context.ResultDouble(valParsed)
227253
case string:
228-
context.ResultText(valParsed)
254+
// parquet-go returns BYTE_ARRAY as string resulting in UTF-8 issues
255+
// When we detect a column that has a BYTE_ARRAY type, we will convert it to a byte slice
256+
if colName.Type == "BYTE_ARRAY" || colName.Type == "FIXED_LEN_BYTE_ARRAY" {
257+
// Convert the string to a byte slice
258+
context.ResultBlob([]byte(valParsed))
259+
} else {
260+
context.ResultText(valParsed)
261+
}
229262
case []byte:
230263
context.ResultBlob(valParsed)
231264
case map[string]interface{}:
265+
for key, value := range valParsed {
266+
267+
// Get the subfield type if it exists
268+
subFieldType, ok := colName.SubFields[key]
269+
if !ok {
270+
continue
271+
}
272+
273+
// Same as the string case, we need to handle BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY
274+
if subFieldType.Type == "BYTE_ARRAY" || subFieldType.Type == "FIXED_LEN_BYTE_ARRAY" {
275+
if strValue, ok := value.(string); ok {
276+
valParsed[key] = []byte(strValue)
277+
} else if byteValue, ok := value.([]byte); ok {
278+
valParsed[key] = byteValue
279+
} else {
280+
valParsed[key] = value
281+
}
282+
}
283+
}
232284
marshaled, err := json.Marshal(valParsed)
233285
if err != nil {
234286
context.ResultNull()

0 commit comments

Comments
 (0)