Skip to content

Commit c780ffa

Browse files
authored
GH-3116: Implement the decoding of Variant values (#3197)
1 parent 66e0c4e commit c780ffa

File tree

7 files changed

+2559
-0
lines changed

7 files changed

+2559
-0
lines changed

parquet-variant/pom.xml

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
<!--
2+
~ Licensed to the Apache Software Foundation (ASF) under one
3+
~ or more contributor license agreements. See the NOTICE file
4+
~ distributed with this work for additional information
5+
~ regarding copyright ownership. The ASF licenses this file
6+
~ to you under the Apache License, Version 2.0 (the
7+
~ "License"); you may not use this file except in compliance
8+
~ with the License. You may obtain a copy of the License at
9+
~
10+
~ http://www.apache.org/licenses/LICENSE-2.0
11+
~
12+
~ Unless required by applicable law or agreed to in writing,
13+
~ software distributed under the License is distributed on an
14+
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
~ KIND, either express or implied. See the License for the
16+
~ specific language governing permissions and limitations
17+
~ under the License.
18+
-->
19+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
20+
<!-- Maven descriptor for the parquet-variant module; it inherits from the org.apache.parquet:parquet POM located one directory up. -->
<parent>
21+
<groupId>org.apache.parquet</groupId>
22+
<artifactId>parquet</artifactId>
23+
<relativePath>../pom.xml</relativePath>
24+
<version>1.16.0-SNAPSHOT</version>
25+
</parent>
26+
27+
<modelVersion>4.0.0</modelVersion>
28+
29+
<!-- No groupId or version is declared for this module in this file. -->
<artifactId>parquet-variant</artifactId>
30+
<packaging>jar</packaging>
31+
32+
<name>Apache Parquet Variant</name>
33+
<url>https://parquet.apache.org</url>
34+
35+
<!-- Intentionally empty: no module-specific properties are defined here. -->
<properties>
36+
</properties>
37+
38+
<!-- Both dependencies are test-scoped. The guava.version and slf4j.version properties are not defined in this file. NOTE(review): presumably they come from the parent POM; confirm. -->
<dependencies>
39+
<dependency>
40+
<groupId>com.google.guava</groupId>
41+
<artifactId>guava</artifactId>
42+
<version>${guava.version}</version>
43+
<scope>test</scope>
44+
</dependency>
45+
<dependency>
46+
<groupId>org.slf4j</groupId>
47+
<artifactId>slf4j-api</artifactId>
48+
<version>${slf4j.version}</version>
49+
<scope>test</scope>
50+
</dependency>
51+
</dependencies>
52+
53+
<!-- The jar and shade plugins are declared without a version or configuration in this file. NOTE(review): presumably both are managed by the parent POM; confirm. -->
<build>
54+
<plugins>
55+
<plugin>
56+
<groupId>org.apache.maven.plugins</groupId>
57+
<artifactId>maven-jar-plugin</artifactId>
58+
</plugin>
59+
<plugin>
60+
<groupId>org.apache.maven.plugins</groupId>
61+
<artifactId>maven-shade-plugin</artifactId>
62+
</plugin>
63+
</plugins>
64+
</build>
65+
66+
</project>
parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java

Lines changed: 305 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,305 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet.variant;
20+
21+
import java.math.BigDecimal;
22+
import java.nio.ByteBuffer;
23+
import java.util.UUID;
24+
25+
/**
26+
* This Variant class holds the Variant-encoded value and metadata binary values.
27+
*/
28+
public final class Variant {
29+
/** The buffer that contains the Variant value. */
30+
final ByteBuffer value;
31+
32+
/** The buffer that contains the Variant metadata. */
33+
final ByteBuffer metadata;
34+
35+
/**
36+
* The threshold to switch from linear search to binary search when looking up a field by key in
37+
* an object. This is a performance optimization to avoid the overhead of binary search for a
38+
* short list.
39+
*/
40+
static final int BINARY_SEARCH_THRESHOLD = 32;
41+
42+
public Variant(byte[] value, byte[] metadata) {
43+
this(value, 0, value.length, metadata, 0, metadata.length);
44+
}
45+
46+
public Variant(byte[] value, int valuePos, int valueLength, byte[] metadata, int metadataPos, int metadataLength) {
47+
this(ByteBuffer.wrap(value, valuePos, valueLength), ByteBuffer.wrap(metadata, metadataPos, metadataLength));
48+
}
49+
50+
public Variant(ByteBuffer value, ByteBuffer metadata) {
51+
// The buffers are read a single-byte at a time, so the endianness of the input buffers
52+
// is not important.
53+
this.value = value.asReadOnlyBuffer();
54+
this.metadata = metadata.asReadOnlyBuffer();
55+
56+
// There is currently only one allowed version.
57+
if ((metadata.get(metadata.position()) & VariantUtil.VERSION_MASK) != VariantUtil.VERSION) {
58+
throw new UnsupportedOperationException(String.format(
59+
"Unsupported variant metadata version: %d",
60+
metadata.get(metadata.position()) & VariantUtil.VERSION_MASK));
61+
}
62+
}
63+
64+
/**
65+
* @return the boolean value
66+
*/
67+
public boolean getBoolean() {
68+
return VariantUtil.getBoolean(value);
69+
}
70+
71+
/**
72+
* @return the byte value
73+
*/
74+
public byte getByte() {
75+
return VariantUtil.getByte(value);
76+
}
77+
78+
/**
79+
* @return the short value
80+
*/
81+
public short getShort() {
82+
return VariantUtil.getShort(value);
83+
}
84+
85+
/**
86+
* @return the int value
87+
*/
88+
public int getInt() {
89+
return VariantUtil.getInt(value);
90+
}
91+
92+
/**
93+
* @return the long value
94+
*/
95+
public long getLong() {
96+
return VariantUtil.getLong(value);
97+
}
98+
99+
/**
100+
* @return the double value
101+
*/
102+
public double getDouble() {
103+
return VariantUtil.getDouble(value);
104+
}
105+
106+
/**
107+
* @return the decimal value
108+
*/
109+
public BigDecimal getDecimal() {
110+
return VariantUtil.getDecimal(value);
111+
}
112+
113+
/**
114+
* @return the float value
115+
*/
116+
public float getFloat() {
117+
return VariantUtil.getFloat(value);
118+
}
119+
120+
/**
121+
* @return the binary value
122+
*/
123+
public ByteBuffer getBinary() {
124+
return VariantUtil.getBinary(value);
125+
}
126+
127+
/**
128+
* @return the UUID value
129+
*/
130+
public UUID getUUID() {
131+
return VariantUtil.getUUID(value);
132+
}
133+
134+
/**
135+
* @return the string value
136+
*/
137+
public String getString() {
138+
return VariantUtil.getString(value);
139+
}
140+
141+
/**
142+
* The value type of Variant value. It is determined by the header byte.
143+
*/
144+
public enum Type {
145+
OBJECT,
146+
ARRAY,
147+
NULL,
148+
BOOLEAN,
149+
BYTE,
150+
SHORT,
151+
INT,
152+
LONG,
153+
STRING,
154+
DOUBLE,
155+
DECIMAL4,
156+
DECIMAL8,
157+
DECIMAL16,
158+
DATE,
159+
TIMESTAMP_TZ,
160+
TIMESTAMP_NTZ,
161+
FLOAT,
162+
BINARY,
163+
TIME,
164+
TIMESTAMP_NANOS_TZ,
165+
TIMESTAMP_NANOS_NTZ,
166+
UUID
167+
}
168+
169+
/**
170+
* @return the type of the variant value
171+
*/
172+
public Type getType() {
173+
return VariantUtil.getType(value);
174+
}
175+
176+
/**
177+
* @return the number of object fields in the variant
178+
* @throws IllegalArgumentException if `getType()` does not return `Type.OBJECT`
179+
*/
180+
public int numObjectElements() {
181+
return VariantUtil.getObjectInfo(value).numElements;
182+
}
183+
184+
/**
185+
* Returns the object field Variant value whose key is equal to `key`.
186+
* Returns null if the key is not found.
187+
* @param key the key to look up
188+
* @return the field value whose key is equal to `key`, or null if key is not found
189+
* @throws IllegalArgumentException if `getType()` does not return `Type.OBJECT`
190+
*/
191+
public Variant getFieldByKey(String key) {
192+
VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value);
193+
// Use linear search for a short list. Switch to binary search when the length reaches
194+
// `BINARY_SEARCH_THRESHOLD`.
195+
if (info.numElements < BINARY_SEARCH_THRESHOLD) {
196+
for (int i = 0; i < info.numElements; ++i) {
197+
ObjectField field = getFieldAtIndex(
198+
i,
199+
value,
200+
metadata,
201+
info.idSize,
202+
info.offsetSize,
203+
value.position() + info.idStartOffset,
204+
value.position() + info.offsetStartOffset,
205+
value.position() + info.dataStartOffset);
206+
if (field.key.equals(key)) {
207+
return field.value;
208+
}
209+
}
210+
} else {
211+
int low = 0;
212+
int high = info.numElements - 1;
213+
while (low <= high) {
214+
// Use unsigned right shift to compute the middle of `low` and `high`. This is not only a
215+
// performance optimization, because it can properly handle the case where `low + high`
216+
// overflows int.
217+
int mid = (low + high) >>> 1;
218+
ObjectField field = getFieldAtIndex(
219+
mid,
220+
value,
221+
metadata,
222+
info.idSize,
223+
info.offsetSize,
224+
value.position() + info.idStartOffset,
225+
value.position() + info.offsetStartOffset,
226+
value.position() + info.dataStartOffset);
227+
int cmp = field.key.compareTo(key);
228+
if (cmp < 0) {
229+
low = mid + 1;
230+
} else if (cmp > 0) {
231+
high = mid - 1;
232+
} else {
233+
return field.value;
234+
}
235+
}
236+
}
237+
return null;
238+
}
239+
240+
/**
241+
* A field in a Variant object.
242+
*/
243+
static final class ObjectField {
244+
public final String key;
245+
public final Variant value;
246+
247+
public ObjectField(String key, Variant value) {
248+
this.key = key;
249+
this.value = value;
250+
}
251+
}
252+
253+
private static ObjectField getFieldAtIndex(
254+
int index,
255+
ByteBuffer value,
256+
ByteBuffer metadata,
257+
int idSize,
258+
int offsetSize,
259+
int idStart,
260+
int offsetStart,
261+
int dataStart) {
262+
// idStart, offsetStart, and dataStart are absolute positions in the `value` buffer.
263+
int id = VariantUtil.readUnsigned(value, idStart + idSize * index, idSize);
264+
int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * index, offsetSize);
265+
String key = VariantUtil.getMetadataKey(metadata, id);
266+
Variant v = new Variant(VariantUtil.slice(value, dataStart + offset), metadata);
267+
return new ObjectField(key, v);
268+
}
269+
270+
/**
271+
* @return the number of array elements
272+
* @throws IllegalArgumentException if `getType()` does not return `Type.ARRAY`
273+
*/
274+
public int numArrayElements() {
275+
return VariantUtil.getArrayInfo(value).numElements;
276+
}
277+
278+
/**
279+
* Returns the array element Variant value at the `index` slot. Returns null if `index` is
280+
* out of the bound of `[0, arraySize())`.
281+
* @param index the index of the array element to get
282+
* @return the array element Variant at the `index` slot, or null if `index` is out of bounds
283+
* @throws IllegalArgumentException if `getType()` does not return `Type.ARRAY`
284+
*/
285+
public Variant getElementAtIndex(int index) {
286+
VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value);
287+
if (index < 0 || index >= info.numElements) {
288+
return null;
289+
}
290+
return getElementAtIndex(
291+
index,
292+
value,
293+
metadata,
294+
info.offsetSize,
295+
value.position() + info.offsetStartOffset,
296+
value.position() + info.dataStartOffset);
297+
}
298+
299+
private static Variant getElementAtIndex(
300+
int index, ByteBuffer value, ByteBuffer metadata, int offsetSize, int offsetStart, int dataStart) {
301+
// offsetStart and dataStart are absolute positions in the `value` buffer.
302+
int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * index, offsetSize);
303+
return new Variant(VariantUtil.slice(value, dataStart + offset), metadata);
304+
}
305+
}

0 commit comments

Comments
 (0)