Skip to content

Commit 735d3c1

Browse files
Use dictionary decoding producer to handle dictionary fields that are not valid avro enums
1 parent f4134d1 commit 735d3c1

File tree

3 files changed

+58
-9
lines changed

3 files changed

+58
-9
lines changed

adapter/avro/src/main/java/org/apache/arrow/adapter/avro/ArrowToAvroUtils.java

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.apache.arrow.adapter.avro.producers.AvroBigIntProducer;
2525
import org.apache.arrow.adapter.avro.producers.AvroBooleanProducer;
2626
import org.apache.arrow.adapter.avro.producers.AvroBytesProducer;
27+
import org.apache.arrow.adapter.avro.producers.AvroEnumProducer;
2728
import org.apache.arrow.adapter.avro.producers.AvroFixedSizeBinaryProducer;
2829
import org.apache.arrow.adapter.avro.producers.AvroFixedSizeListProducer;
2930
import org.apache.arrow.adapter.avro.producers.AvroFloat2Producer;
@@ -44,6 +45,7 @@
4445
import org.apache.arrow.adapter.avro.producers.AvroUint8Producer;
4546
import org.apache.arrow.adapter.avro.producers.BaseAvroProducer;
4647
import org.apache.arrow.adapter.avro.producers.CompositeAvroProducer;
48+
import org.apache.arrow.adapter.avro.producers.DictionaryDecodingProducer;
4749
import org.apache.arrow.adapter.avro.producers.Producer;
4850
import org.apache.arrow.adapter.avro.producers.logical.AvroDateDayProducer;
4951
import org.apache.arrow.adapter.avro.producers.logical.AvroDateMilliProducer;
@@ -62,6 +64,7 @@
6264
import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampSecProducer;
6365
import org.apache.arrow.adapter.avro.producers.logical.AvroTimestampSecTzProducer;
6466
import org.apache.arrow.util.Preconditions;
67+
import org.apache.arrow.vector.BaseIntVector;
6568
import org.apache.arrow.vector.BigIntVector;
6669
import org.apache.arrow.vector.BitVector;
6770
import org.apache.arrow.vector.DateDayVector;
@@ -100,7 +103,6 @@
100103
import org.apache.arrow.vector.complex.MapVector;
101104
import org.apache.arrow.vector.complex.StructVector;
102105
import org.apache.arrow.vector.dictionary.Dictionary;
103-
import org.apache.arrow.vector.dictionary.DictionaryEncoder;
104106
import org.apache.arrow.vector.dictionary.DictionaryProvider;
105107
import org.apache.arrow.vector.types.FloatingPointPrecision;
106108
import org.apache.arrow.vector.types.TimeUnit;
@@ -557,9 +559,11 @@ private static BaseAvroProducer<?> createProducer(
557559
}
558560
// If a field is dictionary-encoded but cannot be represented as an Avro enum,
559561
// then decode it before writing
560-
if (!dictionaryIsValidEnum(dictionary)) {
561-
FieldVector decodedVector = (FieldVector) DictionaryEncoder.decode(vector, dictionary);
562-
return createProducer(decodedVector, nullable, dictionaries);
562+
if (dictionaryIsValidEnum(dictionary)) {
563+
return new AvroEnumProducer((BaseIntVector) vector);
564+
} else {
565+
BaseAvroProducer<?> dictProducer = createProducer(dictionary.getVector(), false, null);
566+
return new DictionaryDecodingProducer<>((BaseIntVector) vector, dictProducer);
563567
}
564568
}
565569

adapter/avro/src/main/java/org/apache/arrow/adapter/avro/producers/AvroEnumProducer.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,22 @@
1717
package org.apache.arrow.adapter.avro.producers;
1818

1919
import java.io.IOException;
20-
import org.apache.arrow.vector.IntVector;
20+
import org.apache.arrow.vector.BaseIntVector;
2121
import org.apache.avro.io.Encoder;
2222

2323
/**
24-
* Producer that produces enum values from a dictionary-encoded {@link IntVector}, writes data to an
24+
* Producer that produces enum values from a dictionary-encoded {@link BaseIntVector}, writes data to an
2525
* Avro encoder.
2626
*/
27-
public class AvroEnumProducer extends BaseAvroProducer<IntVector> {
27+
public class AvroEnumProducer extends BaseAvroProducer<BaseIntVector> {
2828

2929
/** Instantiate an AvroEnumProducer. */
30-
public AvroEnumProducer(IntVector vector) {
30+
public AvroEnumProducer(BaseIntVector vector) {
3131
super(vector);
3232
}
3333

3434
@Override
3535
public void produce(Encoder encoder) throws IOException {
36-
encoder.writeEnum(vector.get(currentIndex++));
36+
encoder.writeEnum((int) vector.getValueAsLong(currentIndex++));
3737
}
3838
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.arrow.adapter.avro.producers;
18+
19+
import java.io.IOException;
20+
import org.apache.arrow.vector.BaseIntVector;
21+
import org.apache.arrow.vector.FieldVector;
22+
import org.apache.avro.io.Encoder;
23+
24+
/**
25+
* Producer that produces decoded values from a dictionary-encoded {@link BaseIntVector}, writes data to an
26+
* Avro encoder.
27+
*/
28+
public class DictionaryDecodingProducer<T extends FieldVector>
29+
extends BaseAvroProducer<BaseIntVector> {
30+
31+
private final Producer<T> dictProducer;
32+
33+
/** Instantiate a DictionaryDecodingProducer. */
34+
public DictionaryDecodingProducer(BaseIntVector indexVector, Producer<T> dictProducer) {
35+
super(indexVector);
36+
this.dictProducer = dictProducer;
37+
}
38+
39+
@Override
40+
public void produce(Encoder encoder) throws IOException {
41+
int dicIndex = (int) vector.getValueAsLong(currentIndex++);
42+
dictProducer.setPosition(dicIndex);
43+
dictProducer.produce(encoder);
44+
}
45+
}

0 commit comments

Comments
 (0)