Skip to content

Commit 27e03ce

Browse files
authored
snowflake: use new parquet-go parallelized row group construction (#3940)
1 parent 7be41c6 commit 27e03ce

File tree

14 files changed

+1411
-155
lines changed

14 files changed

+1411
-155
lines changed

go.mod

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ require (
127127
github.com/opensearch-project/opensearch-go/v3 v3.1.0
128128
github.com/ory/dockertest/v3 v3.12.0
129129
github.com/oschwald/geoip2-golang v1.13.0
130-
github.com/parquet-go/parquet-go v0.25.1
130+
github.com/parquet-go/parquet-go v0.27.0
131131
github.com/pebbe/zmq4 v1.4.0
132132
github.com/pinecone-io/go-pinecone v1.1.1
133133
github.com/pkg/sftp v1.13.9
@@ -264,6 +264,8 @@ require (
264264
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
265265
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
266266
github.com/onsi/gomega v1.37.0 // indirect
267+
github.com/parquet-go/bitpack v1.0.0 // indirect
268+
github.com/parquet-go/jsonlite v1.0.0 // indirect
267269
github.com/pierrec/lz4 v2.6.1+incompatible // indirect
268270
github.com/pingcap/errors v0.11.5-0.20250318082626-8f80e5cb09ec // indirect
269271
github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86 // indirect
@@ -282,6 +284,7 @@ require (
282284
github.com/tidwall/match v1.1.1 // indirect
283285
github.com/tidwall/pretty v1.2.1 // indirect
284286
github.com/timandy/routine v1.1.5 // indirect
287+
github.com/twpayne/go-geom v1.6.1 // indirect
285288
github.com/x448/float16 v0.8.4 // indirect
286289
github.com/xanzy/ssh-agent v0.3.3 // indirect
287290
github.com/yosida95/uritemplate/v3 v3.0.2 // indirect
@@ -490,7 +493,6 @@ require (
490493
github.com/robfig/cron/v3 v3.0.1 // indirect
491494
github.com/russross/blackfriday/v2 v2.1.0 // indirect
492495
github.com/segmentio/asm v1.2.1 // indirect
493-
github.com/segmentio/encoding v0.5.3
494496
github.com/segmentio/ksuid v1.0.4 // indirect
495497
github.com/shopspring/decimal v1.4.0 // indirect
496498
github.com/sirupsen/logrus v1.9.3 // indirect

go.sum

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -770,6 +770,8 @@ github.com/ClickHouse/ch-go v0.68.0 h1:zd2VD8l2aVYnXFRyhTyKCrxvhSz1AaY4wBUXu/f0G
770770
github.com/ClickHouse/ch-go v0.68.0/go.mod h1:C89Fsm7oyck9hr6rRo5gqqiVtaIY6AjdD0WFMyNRQ5s=
771771
github.com/ClickHouse/clickhouse-go/v2 v2.40.3 h1:46jB4kKwVDUOnECpStKMVXxvR0Cg9zeV9vdbPjtn6po=
772772
github.com/ClickHouse/clickhouse-go/v2 v2.40.3/go.mod h1:qO0HwvjCnTB4BPL/k6EE3l4d9f/uF+aoimAhJX70eKA=
773+
github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU=
774+
github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU=
773775
github.com/DataDog/datadog-go v2.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ=
774776
github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ=
775777
github.com/DataDog/zstd v1.5.7 h1:ybO8RBeh29qrxIhCA9E8gKY6xfONU9T6G6aP9DTKfLE=
@@ -834,6 +836,10 @@ github.com/ajstarks/deck v0.0.0-20200831202436-30c9fc6549a9/go.mod h1:JynElWSGnm
834836
github.com/ajstarks/deck/generate v0.0.0-20210309230005-c3f852c02e19/go.mod h1:T13YZdzov6OU0A1+RfKZiZN9ca6VeKdBdyDV+BY97Tk=
835837
github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw=
836838
github.com/ajstarks/svgo v0.0.0-20211024235047-1546f124cd8b/go.mod h1:1KcenG0jGWcpt8ov532z81sp/kMMUG485J2InIOyADM=
839+
github.com/alecthomas/assert/v2 v2.10.0 h1:jjRCHsj6hBJhkmhznrCzoNpbA3zqy0fYiUcYZP/GkPY=
840+
github.com/alecthomas/assert/v2 v2.10.0/go.mod h1:Bze95FyfUr7x34QZrjL+XP+0qgp/zg8yS+TtBj1WA3k=
841+
github.com/alecthomas/repr v0.4.0 h1:GhI2A8MACjfegCPVq9f1FLvIBS+DrQ2KQBFZP1iFzXc=
842+
github.com/alecthomas/repr v0.4.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4=
837843
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
838844
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
839845
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
@@ -1915,8 +1921,12 @@ github.com/oschwald/geoip2-golang v1.13.0 h1:Q44/Ldc703pasJeP5V9+aFSZFmBN7DKHbNs
19151921
github.com/oschwald/geoip2-golang v1.13.0/go.mod h1:P9zG+54KPEFOliZ29i7SeYZ/GM6tfEL+rgSn03hYuUo=
19161922
github.com/oschwald/maxminddb-golang v1.13.1 h1:G3wwjdN9JmIK2o/ermkHM+98oX5fS+k5MbwsmL4MRQE=
19171923
github.com/oschwald/maxminddb-golang v1.13.1/go.mod h1:K4pgV9N/GcK694KSTmVSDTODk4IsCNThNdTmnaBZ/F8=
1918-
github.com/parquet-go/parquet-go v0.25.1 h1:l7jJwNM0xrk0cnIIptWMtnSnuxRkwq53S+Po3KG8Xgo=
1919-
github.com/parquet-go/parquet-go v0.25.1/go.mod h1:AXBuotO1XiBtcqJb/FKFyjBG4aqa3aQAAWF3ZPzCanY=
1924+
github.com/parquet-go/bitpack v1.0.0 h1:AUqzlKzPPXf2bCdjfj4sTeacrUwsT7NlcYDMUQxPcQA=
1925+
github.com/parquet-go/bitpack v1.0.0/go.mod h1:XnVk9TH+O40eOOmvpAVZ7K2ocQFrQwysLMnc6M/8lgs=
1926+
github.com/parquet-go/jsonlite v1.0.0 h1:87QNdi56wOfsE5bdgas0vRzHPxfJgzrXGml1zZdd7VU=
1927+
github.com/parquet-go/jsonlite v1.0.0/go.mod h1:nDjpkpL4EOtqs6NQugUsi0Rleq9sW/OtC1NnZEnxzF0=
1928+
github.com/parquet-go/parquet-go v0.27.0 h1:vHWK2xaHbj+v1DYps03yDRpEsdtOeKbhiXUaixoPb3g=
1929+
github.com/parquet-go/parquet-go v0.27.0/go.mod h1:navtkAYr2LGoJVp141oXPlO/sxLvaOe3la2JEoD8+rg=
19201930
github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
19211931
github.com/paulmach/orb v0.12.0 h1:z+zOwjmG3MyEEqzv92UN49Lg1JFYx0L9GpGKNVDKk1s=
19221932
github.com/paulmach/orb v0.12.0/go.mod h1:5mULz1xQfs3bmQm63QEJA6lNGujuRafwA5S/EnuLaLU=
@@ -2071,8 +2081,6 @@ github.com/sashabaranov/go-openai v1.41.2/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adO
20712081
github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0=
20722082
github.com/segmentio/asm v1.2.1 h1:DTNbBqs57ioxAD4PrArqftgypG4/qNpXoJx8TVXxPR0=
20732083
github.com/segmentio/asm v1.2.1/go.mod h1:BqMnlJP91P8d+4ibuonYZw9mfnzI9HfxselHZr5aAcs=
2074-
github.com/segmentio/encoding v0.5.3 h1:OjMgICtcSFuNvQCdwqMCv9Tg7lEOXGwm1J5RPQccx6w=
2075-
github.com/segmentio/encoding v0.5.3/go.mod h1:HS1ZKa3kSN32ZHVZ7ZLPLXWvOVIiZtyJnO1gPH1sKt0=
20762084
github.com/segmentio/ksuid v1.0.4 h1:sBo2BdShXjmcugAMwjugoGUdUV0pcxY5mW4xKRn3v4c=
20772085
github.com/segmentio/ksuid v1.0.4/go.mod h1:/XUiZBD3kVx5SmUOl55voK5yeAbBNNIed+2O73XgrPE=
20782086
github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw=
@@ -2196,6 +2204,8 @@ github.com/twmb/franz-go/pkg/sr v1.6.0 h1:YcnD65hmdEuJljSM4O9Hldr/0oi+vrjPGHaRUu
21962204
github.com/twmb/franz-go/pkg/sr v1.6.0/go.mod h1:64CsHlsQnyFRq1sYPcCmlRrEG3PlLPb6cDddx2wGr28=
21972205
github.com/twmb/go-cache v1.2.1 h1:yUkLutow4S2x5NMbqFW24o14OsucoFI5Fzmlb6uBinM=
21982206
github.com/twmb/go-cache v1.2.1/go.mod h1:lArg9KhCl+GTFMikitLGhIBh/i11OK0lhSveqlMbbrY=
2207+
github.com/twpayne/go-geom v1.6.1 h1:iLE+Opv0Ihm/ABIcvQFGIiFBXd76oBIar9drAwHFhR4=
2208+
github.com/twpayne/go-geom v1.6.1/go.mod h1:Kr+Nly6BswFsKM5sd31YaoWS5PeDDH2NftJTK7Gd028=
21992209
github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw=
22002210
github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY=
22012211
github.com/uptrace/bun v1.1.12 h1:sOjDVHxNTuM6dNGaba0wUuz7KvDE1BmNu9Gqs2gJSXQ=

internal/impl/parquet/processor_encode_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,10 @@ schema:
4444

4545
tctx := t.Context()
4646
_, err = encodeProc.ProcessBatch(tctx, service.MessageBatch{
47-
service.NewMessage([]byte(`{"id":12,"name":"foo"}`)),
47+
service.NewMessage([]byte(`{"id":"bar","name":"foo"}`)),
4848
})
4949
require.Error(t, err)
50-
assert.Contains(t, err.Error(), "cannot create parquet value of type FLOAT from go value of type int64")
50+
assert.Contains(t, err.Error(), "encoding panic")
5151
}
5252

5353
func TestParquetEncodeDecodeRoundTrip(t *testing.T) {

internal/impl/snowflake/streaming/integration_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ func setup(t *testing.T) (*streaming.SnowflakeRestClient, *streaming.SnowflakeSe
6767
clientOptions := streaming.ClientOptions{
6868
Account: envOr("SNOWFLAKE_ACCOUNT", "wqkfxqq-redpanda_aws"),
6969
URL: fmt.Sprintf("https://%s.snowflakecomputing.com", envOr("SNOWFLAKE_ACCOUNT", "wqkfxqq-redpanda_aws")),
70-
User: envOr("SNOWFLAKE_USER", "TYLERTYLER_DB"),
70+
User: envOr("SNOWFLAKE_USER", "TYLERROCKWOOD"),
7171
Role: "ACCOUNTADMIN",
7272
PrivateKey: parseResult.(*rsa.PrivateKey),
7373
ConnectVersion: "",

internal/impl/snowflake/streaming/parquet.go

Lines changed: 58 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,11 @@ package streaming
1212

1313
import (
1414
"bytes"
15-
"encoding/binary"
1615
"errors"
1716
"fmt"
1817

1918
"github.com/parquet-go/parquet-go"
2019
"github.com/parquet-go/parquet-go/format"
21-
"github.com/segmentio/encoding/thrift"
2220

2321
"github.com/redpanda-data/benthos/v4/public/service"
2422
)
@@ -66,41 +64,40 @@ func objectMessageToRow(msg *service.Message, out []any, nameToPosition map[stri
6664
return nil
6765
}
6866

69-
func constructRowGroupFromObject(
67+
// writeRowGroupFromObject writes a batch of object messages directly to a concurrent row group's column writers,
68+
// then flushes (compresses) the row group. Values are written directly to the column writers as they are converted.
69+
func writeRowGroupFromObject(
7070
batch service.MessageBatch,
7171
schema *parquet.Schema,
7272
transformers []*dataTransformer,
7373
mode SchemaMode,
74-
) ([]parquet.Row, []*statsBuffer, error) {
75-
// We write all of our data in a columnar fashion, but need to pivot that data so that we can feed it into
76-
// our parquet library (which sadly will redo the pivot - maybe we need a lower level abstraction...).
77-
// So create a massive matrix that we will write stuff in columnar form, but then we don't need to move any
78-
// data to create rows of the data via an in-place transpose operation.
79-
//
80-
// TODO: Consider caching/pooling this matrix as I expect many are similarly sized.
74+
rg *parquet.ConcurrentRowGroupWriter,
75+
) ([]*statsBuffer, error) {
8176
rowWidth := len(schema.Fields())
82-
matrix := make([]parquet.Value, len(batch)*rowWidth)
8377
nameToPosition := make(map[string]int, rowWidth)
8478
stats := make([]*statsBuffer, rowWidth)
8579
buffers := make([]typedBuffer, rowWidth)
80+
columnWriters := rg.ColumnWriters()
81+
8682
for idx, t := range transformers {
8783
leaf, ok := schema.Lookup(t.name)
8884
if !ok {
89-
return nil, nil, fmt.Errorf("invariant failed: unable to find column %q", t.name)
85+
return nil, fmt.Errorf("invariant failed: unable to find column %q", t.name)
9086
}
9187
buffers[idx] = t.bufferFactory()
92-
buffers[idx].Prepare(matrix, leaf.ColumnIndex, rowWidth)
88+
buffers[idx].Reset(columnWriters[leaf.ColumnIndex], leaf.ColumnIndex)
9389
stats[idx] = &statsBuffer{}
9490
nameToPosition[t.name] = idx
9591
}
96-
// First we need to shred our record into columns, snowflake's data model
97-
// is thankfully a flat list of columns, so no dremel style record shredding
98-
// is needed
92+
93+
// Shred records into columns - snowflake's data model is a flat list of columns,
94+
// so no dremel style record shredding is needed. Values are written directly
95+
// to column writers as they are converted.
9996
row := make([]any, rowWidth)
10097
for _, msg := range batch {
10198
err := objectMessageToRow(msg, row, nameToPosition, mode)
10299
if err != nil {
103-
return nil, nil, err
100+
return nil, err
104101
}
105102
for i, v := range row {
106103
t := transformers[i]
@@ -109,24 +106,21 @@ func constructRowGroupFromObject(
109106
err = t.converter.ValidateAndConvert(s, v, b)
110107
if err != nil {
111108
if errors.Is(err, errNullValue) {
112-
return nil, nil, &NonNullColumnError{msg, t.column.Name}
109+
return nil, &NonNullColumnError{msg, t.column.Name}
113110
}
114-
// There is no special typed error for a validation error, there really isn't
115-
// anything we can do about it.
116-
return nil, nil, fmt.Errorf("invalid data for column %s: %w", t.name, err)
111+
return nil, fmt.Errorf("invalid data for column %s: %w", t.name, err)
117112
}
118113
// reset the column as nil for the next row
119114
row[i] = nil
120115
}
121116
}
122-
// Now all our values have been written to each buffer - here is where we do our matrix
123-
// transpose mentioned above
124-
rows := make([]parquet.Row, len(batch))
125-
for i := range rows {
126-
rowStart := i * rowWidth
127-
rows[i] = matrix[rowStart : rowStart+rowWidth]
117+
118+
// Flush compresses the row group data
119+
if err := rg.Flush(); err != nil {
120+
return nil, fmt.Errorf("failed to flush row group: %w", err)
128121
}
129-
return rows, stats, nil
122+
123+
return stats, nil
130124
}
131125

132126
// arrayMessageToRow converts a message into columnar form using the provided name to index mapping.
@@ -161,31 +155,35 @@ func arrayMessageToRow(msg *service.Message, out []any, mode SchemaMode) error {
161155
return nil
162156
}
163157

164-
func constructRowGroupFromArray(
158+
// writeRowGroupFromArray writes a batch of array messages directly to a concurrent row group's column writers,
159+
// then flushes (compresses) the row group. Values are written directly to the column writers as they are converted.
160+
func writeRowGroupFromArray(
165161
batch service.MessageBatch,
166162
schema *parquet.Schema,
167163
transformers []*dataTransformer,
168164
mode SchemaMode,
169-
) ([]parquet.Row, []*statsBuffer, error) {
170-
// TODO: Switch to using concurrent row groups to write this stuff
165+
rg *parquet.ConcurrentRowGroupWriter,
166+
) ([]*statsBuffer, error) {
171167
rowWidth := len(schema.Fields())
172-
matrix := make([]parquet.Value, len(batch)*rowWidth)
173168
stats := make([]*statsBuffer, rowWidth)
174169
buffers := make([]typedBuffer, rowWidth)
170+
columnWriters := rg.ColumnWriters()
171+
175172
for idx, t := range transformers {
176173
leaf, ok := schema.Lookup(t.name)
177174
if !ok {
178-
return nil, nil, fmt.Errorf("invariant failed: unable to find column %q", t.name)
175+
return nil, fmt.Errorf("invariant failed: unable to find column %q", t.name)
179176
}
180177
buffers[idx] = t.bufferFactory()
181-
buffers[idx].Prepare(matrix, leaf.ColumnIndex, rowWidth)
178+
buffers[idx].Reset(columnWriters[leaf.ColumnIndex], leaf.ColumnIndex)
182179
stats[idx] = &statsBuffer{}
183180
}
181+
184182
row := make([]any, rowWidth)
185183
for _, msg := range batch {
186184
err := arrayMessageToRow(msg, row, mode)
187185
if err != nil {
188-
return nil, nil, err
186+
return nil, err
189187
}
190188
for i, v := range row {
191189
t := transformers[i]
@@ -194,29 +192,27 @@ func constructRowGroupFromArray(
194192
err = t.converter.ValidateAndConvert(s, v, b)
195193
if err != nil {
196194
if errors.Is(err, errNullValue) {
197-
return nil, nil, &NonNullColumnError{msg, t.column.Name}
195+
return nil, &NonNullColumnError{msg, t.column.Name}
198196
}
199-
// There is no special typed error for a validation error, there really isn't
200-
// anything we can do about it.
201-
return nil, nil, fmt.Errorf("invalid data for column %s: %w", t.name, err)
197+
return nil, fmt.Errorf("invalid data for column %s: %w", t.name, err)
202198
}
203199
// reset the column as nil for the next row
204200
row[i] = nil
205201
}
206202
}
207-
// Now all our values have been written to each buffer - here is where we do our matrix
208-
// transpose mentioned above
209-
rows := make([]parquet.Row, len(batch))
210-
for i := range rows {
211-
rowStart := i * rowWidth
212-
rows[i] = matrix[rowStart : rowStart+rowWidth]
203+
204+
// Flush compresses the row group data
205+
if err := rg.Flush(); err != nil {
206+
return nil, fmt.Errorf("failed to flush row group: %w", err)
213207
}
214-
return rows, stats, nil
208+
209+
return stats, nil
215210
}
216211

217212
type parquetWriter struct {
218-
b *bytes.Buffer
219-
w *parquet.GenericWriter[any]
213+
b *bytes.Buffer
214+
w *parquet.GenericWriter[any]
215+
schema *parquet.Schema
220216
}
221217

222218
func newParquetWriter(rpcnVersion string, schema *parquet.Schema) *parquetWriter {
@@ -230,53 +226,32 @@ func newParquetWriter(rpcnVersion string, schema *parquet.Schema) *parquetWriter
230226
parquet.Compression(&parquet.Zstd),
231227
parquet.WriteBufferSize(0),
232228
)
233-
return &parquetWriter{b, w}
229+
return &parquetWriter{b, w, schema}
234230
}
235231

236-
// WriteFile writes a new parquet file using the rows and metadata.
237-
//
238-
// NOTE: metadata is sticky - if you want the next file to remove metadata you need to set the value to the empty string
239-
// to actually remove it. In the usage of this method in this package, the metadata keys are all always the same.
240-
func (w *parquetWriter) WriteFile(rows []parquet.Row, metadata map[string]string) (out []byte, err error) {
232+
// BeginRowGroup creates a new concurrent row group for parallel construction.
233+
func (w *parquetWriter) BeginRowGroup() *parquet.ConcurrentRowGroupWriter {
234+
return w.w.BeginRowGroup()
235+
}
236+
237+
// Reset prepares the writer for a new file with the given metadata.
238+
func (w *parquetWriter) Reset(metadata map[string]string) {
241239
for k, v := range metadata {
242240
w.w.SetKeyValueMetadata(k, v)
243241
}
244242
w.b.Reset()
245243
w.w.Reset(w.b)
246-
defer func() {
247-
if r := recover(); r != nil {
248-
err = fmt.Errorf("encoding panic: %v", r)
249-
}
250-
}()
251-
_, err = w.w.WriteRows(rows)
252-
if err != nil {
253-
return
254-
}
255-
err = w.w.Close()
256-
out = w.b.Bytes()
257-
return
258244
}
259245

260-
func readParquetMetadata(parquetFile []byte) (metadata format.FileMetaData, err error) {
261-
if len(parquetFile) < 8 {
262-
return format.FileMetaData{}, fmt.Errorf("too small of parquet file: %d", len(parquetFile))
263-
}
264-
trailingBytes := parquetFile[len(parquetFile)-8:]
265-
if string(trailingBytes[4:]) != "PAR1" {
266-
return metadata, fmt.Errorf("missing magic bytes, got: %q", trailingBytes[4:])
267-
}
268-
footerSize := int(binary.LittleEndian.Uint32(trailingBytes))
269-
if len(parquetFile) < footerSize+8 {
270-
return metadata, fmt.Errorf("too small of parquet file: %d, footer size: %d", len(parquetFile), footerSize)
271-
}
272-
footerBytes := parquetFile[len(parquetFile)-(footerSize+8) : len(parquetFile)-8]
273-
if err := thrift.Unmarshal(new(thrift.CompactProtocol), footerBytes, &metadata); err != nil {
274-
return metadata, fmt.Errorf("unable to extract parquet metadata: %w", err)
246+
// Close finalizes the parquet file and returns the bytes.
247+
func (w *parquetWriter) Close() ([]byte, *format.FileMetaData, error) {
248+
if err := w.w.Close(); err != nil {
249+
return nil, nil, err
275250
}
276-
return
251+
return w.b.Bytes(), w.w.File().Metadata(), nil
277252
}
278253

279-
func totalUncompressedSize(metadata format.FileMetaData) int32 {
254+
func totalUncompressedSize(metadata *format.FileMetaData) int32 {
280255
var size int64
281256
for _, rowGroup := range metadata.RowGroups {
282257
size += rowGroup.TotalByteSize

0 commit comments

Comments
 (0)