Commit 9b9fdd0
fix(parquet/pqarrow): decoding Parquet with Arrow dict in schema (apache#551)
Fix a spurious `parquet: column chunk cannot have more than one dictionary.` error with specific Parquet files. Resolves apache#546.

The affected Parquet files combine:
* an Arrow Dictionary column
* an Arrow schema serialized in the Parquet metadata
* column chunks with 1 dictionary page + at least 2 data pages

# Bug

When maybeWriteNewDictionary() resets `newDictionary = false` at line 965, it causes the next call to readDictionary() to try to read the dictionary page again from the pager, which then calls configureDict() again, which throws the "cannot have more than one dictionary" error.

The sequence is:
1. Read DICTIONARY_PAGE → newDictionary = true
2. Read DATA_PAGE_1 → calls maybeWriteNewDictionary() → resets newDictionary = false
3. Read DATA_PAGE_2 → calls readDictionary() → since newDictionary = false, tries to get the dictionary page again → calls configureDict() → ERROR because the decoder already exists

# Fix

Added a dictionaryState enum (column_reader.go):
- dictNotRead: dictionary page hasn't been read yet
- dictReadNotInserted: dictionary page read and decoder configured, but not yet inserted into the Arrow builder
- dictFullyProcessed: dictionary fully processed (read + inserted into builder)
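To make the fix concrete, here is a minimal, self-contained sketch of the state machine, not the library's actual reader code: `chunkReader`, `readDictionary`, and `insertIntoBuilder` are hypothetical stand-ins for columnChunkReader, its readDictionary()/configureDict() pair, and maybeWriteNewDictionary():

```go
package main

import "fmt"

// Three-state lifecycle mirroring the fix; a plain bool cannot distinguish
// "already read" from "read but pending insertion into the Arrow builder".
type dictState int

const (
	dictNotRead dictState = iota
	dictReadNotInserted
	dictFullyProcessed
)

// chunkReader is a hypothetical stand-in for columnChunkReader.
type chunkReader struct {
	state      dictState
	configured bool // true once a dictionary decoder exists for this chunk
}

// readDictionary mimics readDictionary() + configureDict(): reading the
// dictionary page a second time for the same chunk is an error.
func (c *chunkReader) readDictionary() error {
	if c.state != dictNotRead {
		return nil // already read while handling an earlier page; skip
	}
	if c.configured {
		return fmt.Errorf("parquet: column chunk cannot have more than one dictionary")
	}
	c.configured = true
	c.state = dictReadNotInserted
	return nil
}

// insertIntoBuilder mimics maybeWriteNewDictionary(): it advances the state
// instead of resetting a boolean (the old `newDictionary = false`, which
// erased the fact that the dictionary had already been read).
func (c *chunkReader) insertIntoBuilder() {
	if c.state == dictReadNotInserted {
		c.state = dictFullyProcessed
	}
}

func main() {
	c := &chunkReader{}
	_ = c.readDictionary() // DICTIONARY_PAGE
	c.insertIntoBuilder()  // DATA_PAGE_1
	// DATA_PAGE_2: with the old boolean, this path re-read the dictionary
	// and raised the error; dictFullyProcessed keeps the re-read skipped.
	if err := c.readDictionary(); err != nil {
		fmt.Println("bug:", err)
	} else {
		fmt.Println("ok: dictionary read exactly once")
	}
}
```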
1 parent 95b3f76 commit 9b9fdd0

File tree

3 files changed: +173 -8 lines changed

parquet/file/column_reader.go

Lines changed: 21 additions & 4 deletions
```diff
@@ -38,6 +38,19 @@ const (
 	defaultPageHeaderSize = 16 * 1024
 )
 
+// dictionaryState tracks the lifecycle of dictionary handling for a column chunk
+type dictionaryState int
+
+const (
+	// dictNotRead: Dictionary page has not been read yet
+	dictNotRead dictionaryState = iota
+	// dictReadNotInserted: Dictionary page has been read and decoder configured,
+	// but not yet inserted into Arrow builder (for Arrow Dictionary types only)
+	dictReadNotInserted
+	// dictFullyProcessed: Dictionary has been read, configured, and inserted into builder
+	dictFullyProcessed
+)
+
 // cloneByteArray is a helper function to clone a slice of byte slices
 func cloneByteArray[T ~[]byte](src []T) {
 	totalLength := 0
@@ -160,7 +173,7 @@ type columnChunkReader struct {
 	defLvlBuffer []int16
 	repLvlBuffer []int16
 
-	newDictionary bool
+	dictState dictionaryState
 }
 
 func newTypedColumnChunkReader(base columnChunkReader) ColumnChunkReader {
@@ -243,7 +256,7 @@ func (c *columnChunkReader) setPageReader(rdr PageReader) {
 	c.Close()
 	c.rdr, c.err = rdr, nil
 	c.decoders = make(map[format.Encoding]encoding.TypedDecoder)
-	c.newDictionary = false
+	c.dictState = dictNotRead
 	c.numBuffered, c.numDecoded = 0, 0
 }
 
@@ -286,7 +299,8 @@ func (c *columnChunkReader) HasNext() bool {
 }
 
 func (c *columnChunkReader) readDictionary() error {
-	if c.newDictionary {
+	// If dictionary has been read (in any state beyond dictNotRead), skip reading
+	if c.dictState != dictNotRead {
 		return nil
 	}
 
@@ -324,7 +338,10 @@ func (c *columnChunkReader) configureDict(page *DictionaryPage) error {
 		return xerrors.New("parquet: dictionary index must be plain encoding")
 	}
 
-	c.newDictionary = true
+	// Dictionary page has been read and decoder configured
+	// For non-Arrow Dictionary types, this is the final state
+	// For Arrow Dictionary types, record reader will advance to dictFullyProcessed
+	c.dictState = dictReadNotInserted
 	c.curDecoder = c.decoders[enc]
 	return nil
 }
```

parquet/file/record_reader.go

Lines changed: 5 additions & 4 deletions
```diff
@@ -953,16 +953,17 @@ func (bd *byteArrayDictRecordReader) flushBuilder() {
 
 func (bd *byteArrayDictRecordReader) maybeWriteNewDictionary() error {
 	rdr := bd.ColumnChunkReader.(*ByteArrayColumnChunkReader)
-	if rdr.newDictionary {
-		// if there is a new dictionary, we may need to flush the builder,
-		// then insert the new dictionary values
+	// Only process dictionary if it's been read but not yet inserted into builder
+	if rdr.dictState == dictReadNotInserted {
+		// Flush the builder and insert the new dictionary values
 		bd.flushBuilder()
 		bd.bldr.(*array.BinaryDictionaryBuilder).ResetFull()
 		dec := rdr.curDecoder.(*encoding.DictByteArrayDecoder)
 		if err := dec.InsertDictionary(bd.bldr); err != nil {
 			return err
 		}
-		rdr.newDictionary = false
+		// Mark dictionary as fully processed (read + inserted)
+		rdr.dictState = dictFullyProcessed
 	}
 	return nil
 }
```
Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@

```go
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pqarrow_test

import (
	"bytes"
	"context"
	"io"
	"testing"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/array"
	"github.com/apache/arrow-go/v18/arrow/memory"
	"github.com/apache/arrow-go/v18/parquet"
	"github.com/apache/arrow-go/v18/parquet/compress"
	"github.com/apache/arrow-go/v18/parquet/file"
	"github.com/apache/arrow-go/v18/parquet/pqarrow"
	"github.com/stretchr/testify/require"
)

// TestArrowDictionaryTypeMultiplePages tests reading Arrow Dictionary types
// with multiple data pages in a single row group.
//
// This test exercises byteArrayDictRecordReader, which had a bug at line 966
// in maybeWriteNewDictionary() that reset newDictionary=false.
//
// The bug manifests when:
// 1. The Arrow schema has a Dictionary type (not just parquet dictionary encoding)
// 2. Multiple data pages exist in a row group
// 3. Reading with a large batch size that spans multiple pages
func TestArrowDictionaryTypeMultiplePages(t *testing.T) {
	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer mem.AssertSize(t, 0)

	// Schema with Arrow Dictionary type
	schema := arrow.NewSchema(
		[]arrow.Field{
			{
				Name: "dict_col",
				Type: &arrow.DictionaryType{
					IndexType: arrow.PrimitiveTypes.Uint32,
					ValueType: arrow.BinaryTypes.String,
				},
				Nullable: false,
			},
		},
		nil,
	)

	var buf bytes.Buffer

	// CRITICAL: Use WithStoreSchema() to preserve Arrow Dictionary type metadata.
	// Without this, arrow-go converts the Dictionary type to a plain string.
	props := parquet.NewWriterProperties(
		parquet.WithDictionaryDefault(true),
		parquet.WithCompression(compress.Codecs.Snappy),
		parquet.WithDataPageSize(10), // Small page size to force multiple pages
		parquet.WithMaxRowGroupLength(100000),
		parquet.WithAllocator(mem),
	)

	writerProps := pqarrow.NewArrowWriterProperties(
		pqarrow.WithStoreSchema(), // KEY: Preserve Arrow Dictionary type
		pqarrow.WithAllocator(mem),
	)

	writer, err := pqarrow.NewFileWriter(schema, &buf, props, writerProps)
	require.NoError(t, err)

	// Create a dictionary array with many values to span multiple pages
	dictBuilder := array.NewDictionaryBuilder(mem, &arrow.DictionaryType{
		IndexType: arrow.PrimitiveTypes.Uint32,
		ValueType: arrow.BinaryTypes.String,
	}).(*array.BinaryDictionaryBuilder)
	defer dictBuilder.Release()

	// Create data with few unique values (good for dictionary encoding)
	values := []string{"ValueA", "ValueB", "ValueC", "ValueD"}
	numRows := 2000

	for i := 0; i < numRows; i++ {
		require.NoError(t, dictBuilder.AppendString(values[i%len(values)]))
	}

	dictArray := dictBuilder.NewDictionaryArray()
	defer dictArray.Release()

	rec := array.NewRecordBatch(schema, []arrow.Array{dictArray}, int64(numRows))
	defer rec.Release()

	err = writer.Write(rec)
	require.NoError(t, err)

	err = writer.Close()
	require.NoError(t, err)

	t.Logf("Written %d bytes", buf.Len())

	// Read back
	pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()),
		file.WithReadProps(parquet.NewReaderProperties(mem)))
	require.NoError(t, err)
	defer pf.Close()

	t.Logf("File has %d row groups", pf.NumRowGroups())

	reader, err := pqarrow.NewFileReader(pf,
		pqarrow.ArrowReadProperties{BatchSize: pf.NumRows()}, mem)
	require.NoError(t, err)

	rr, err := reader.GetRecordReader(context.Background(), nil, nil)
	require.NoError(t, err)
	defer rr.Release()

	// Read all data - this should trigger the bug if present
	totalRows := int64(0)
	for {
		rec, err := rr.Read()
		if err == io.EOF {
			break
		}

		// This will fail with "parquet: column chunk cannot have more than one dictionary"
		// if the bug is present
		require.NoError(t, err, "Failed to read Arrow Dictionary type with multiple pages")

		totalRows += rec.NumRows()
		// Note: Don't call rec.Release() here - the record reader manages record lifecycle
	}

	require.Equal(t, int64(numRows), totalRows, "Should read all rows")
	t.Logf("Successfully read %d rows", totalRows)
}
```
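For local verification, the regression test above should be runnable with `go test -run TestArrowDictionaryTypeMultiplePages` from the test's package directory; the `pqarrow_test` package name suggests it lives under parquet/pqarrow, though the file path is not shown in this diff. Before the fix, the read loop fails on the second data page of the column chunk; with the fix, all 2000 rows are read.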
