Skip to content

Commit 22a23d0

Browse files
authored
PARQUET-2417: Add statistics support to geometry logical type (#2971)
1 parent cdcea6d commit 22a23d0

File tree

25 files changed

+3034
-16
lines changed

25 files changed

+3034
-16
lines changed

parquet-cli/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,8 @@ Usage: parquet [options] [command] [command options]
121121
Rewrite one or more Parquet files to a new Parquet file
122122
size-stats
123123
Print size statistics for a Parquet file
124+
geospatial-stats
125+
Print geospatial statistics for a Parquet file
124126
125127
Examples:
126128

parquet-cli/src/main/java/org/apache/parquet/cli/Main.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import org.apache.parquet.cli.commands.ShowColumnIndexCommand;
5151
import org.apache.parquet.cli.commands.ShowDictionaryCommand;
5252
import org.apache.parquet.cli.commands.ShowFooterCommand;
53+
import org.apache.parquet.cli.commands.ShowGeospatialStatisticsCommand;
5354
import org.apache.parquet.cli.commands.ShowPagesCommand;
5455
import org.apache.parquet.cli.commands.ShowSizeStatisticsCommand;
5556
import org.apache.parquet.cli.commands.ToAvroCommand;
@@ -107,6 +108,7 @@ public class Main extends Configured implements Tool {
107108
jc.addCommand("scan", new ScanCommand(console));
108109
jc.addCommand("rewrite", new RewriteCommand(console));
109110
jc.addCommand("size-stats", new ShowSizeStatisticsCommand(console));
111+
jc.addCommand("geospatial-stats", new ShowGeospatialStatisticsCommand(console));
110112
}
111113

112114
@Override
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet.cli.commands;
20+
21+
import com.beust.jcommander.Parameter;
22+
import com.beust.jcommander.Parameters;
23+
import com.google.common.base.Preconditions;
24+
import com.google.common.collect.Lists;
25+
import java.io.IOException;
26+
import java.util.List;
27+
import org.apache.commons.text.TextStringBuilder;
28+
import org.apache.parquet.cli.BaseCommand;
29+
import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics;
30+
import org.apache.parquet.hadoop.ParquetFileReader;
31+
import org.apache.parquet.hadoop.metadata.BlockMetaData;
32+
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
33+
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
34+
import org.apache.parquet.schema.MessageType;
35+
import org.slf4j.Logger;
36+
37+
@Parameters(commandDescription = "Print geospatial statistics for a Parquet file")
38+
public class ShowGeospatialStatisticsCommand extends BaseCommand {
39+
40+
public ShowGeospatialStatisticsCommand(Logger console) {
41+
super(console);
42+
}
43+
44+
@Parameter(description = "<parquet path>")
45+
List<String> targets;
46+
47+
@Override
48+
@SuppressWarnings("unchecked")
49+
public int run() throws IOException {
50+
Preconditions.checkArgument(targets != null && !targets.isEmpty(), "A Parquet file is required.");
51+
Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
52+
53+
String source = targets.get(0);
54+
try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
55+
ParquetMetadata footer = reader.getFooter();
56+
MessageType schema = footer.getFileMetaData().getSchema();
57+
58+
console.info("\nFile path: {}", source);
59+
60+
List<BlockMetaData> rowGroups = footer.getBlocks();
61+
for (int index = 0, n = rowGroups.size(); index < n; index++) {
62+
printRowGroupGeospatialStats(console, index, rowGroups.get(index), schema);
63+
console.info("");
64+
}
65+
}
66+
67+
return 0;
68+
}
69+
70+
private void printRowGroupGeospatialStats(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
71+
int maxColumnWidth = Math.max(
72+
"column".length(),
73+
rowGroup.getColumns().stream()
74+
.map(col -> col.getPath().toString().length())
75+
.max(Integer::compare)
76+
.orElse(0));
77+
78+
console.info(String.format("\nRow group %d\n%s", index, new TextStringBuilder(80).appendPadding(80, '-')));
79+
80+
String formatString = String.format("%%-%ds %%-15s %%-40s", maxColumnWidth);
81+
console.info(String.format(formatString, "column", "bounding box", "geospatial types"));
82+
83+
for (ColumnChunkMetaData column : rowGroup.getColumns()) {
84+
printColumnGeospatialStats(console, column, schema, maxColumnWidth);
85+
}
86+
}
87+
88+
private void printColumnGeospatialStats(
89+
Logger console, ColumnChunkMetaData column, MessageType schema, int columnWidth) {
90+
GeospatialStatistics stats = column.getGeospatialStatistics();
91+
92+
if (stats != null && stats.isValid()) {
93+
String boundingBox =
94+
stats.getBoundingBox() != null ? stats.getBoundingBox().toString() : "-";
95+
String geospatialTypes = stats.getGeospatialTypes() != null
96+
? stats.getGeospatialTypes().toString()
97+
: "-";
98+
String formatString = String.format("%%-%ds %%-15s %%-40s", columnWidth);
99+
console.info(String.format(formatString, column.getPath(), boundingBox, geospatialTypes));
100+
} else {
101+
String formatString = String.format("%%-%ds %%-15s %%-40s", columnWidth);
102+
console.info(String.format(formatString, column.getPath(), "-", "-"));
103+
}
104+
}
105+
106+
@Override
107+
public List<String> getExamples() {
108+
return Lists.newArrayList("# Show geospatial statistics for a Parquet file", "sample.parquet");
109+
}
110+
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet.cli.commands;
20+
21+
import java.io.File;
22+
import java.io.IOException;
23+
import java.util.Arrays;
24+
import org.apache.hadoop.conf.Configuration;
25+
import org.junit.Assert;
26+
import org.junit.Test;
27+
28+
public class ShowGeospatialStatisticsCommandTest extends ParquetFileTest {
29+
@Test
30+
public void testShowGeospatialStatisticsCommand() throws IOException {
31+
File file = parquetFile();
32+
ShowGeospatialStatisticsCommand command = new ShowGeospatialStatisticsCommand(createLogger());
33+
command.targets = Arrays.asList(file.getAbsolutePath());
34+
command.setConf(new Configuration());
35+
Assert.assertEquals(0, command.run());
36+
}
37+
}

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import org.apache.parquet.column.ParquetProperties;
2727
import org.apache.parquet.column.statistics.SizeStatistics;
2828
import org.apache.parquet.column.statistics.Statistics;
29+
import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics;
2930
import org.apache.parquet.column.values.bloomfilter.AdaptiveBlockSplitBloomFilter;
3031
import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter;
3132
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
@@ -42,6 +43,7 @@ class ColumnValueCollector {
4243
private BloomFilter bloomFilter;
4344
private Statistics<?> statistics;
4445
private SizeStatistics.Builder sizeStatisticsBuilder;
46+
private GeospatialStatistics.Builder geospatialStatisticsBuilder;
4547

4648
ColumnValueCollector(ColumnDescriptor path, BloomFilterWriter bloomFilterWriter, ParquetProperties props) {
4749
this.path = path;
@@ -60,6 +62,9 @@ void resetPageStatistics() {
6062
path.getPrimitiveType(), path.getMaxRepetitionLevel(), path.getMaxDefinitionLevel())
6163
: SizeStatistics.noopBuilder(
6264
path.getPrimitiveType(), path.getMaxRepetitionLevel(), path.getMaxDefinitionLevel());
65+
this.geospatialStatisticsBuilder = statisticsEnabled
66+
? GeospatialStatistics.newBuilder(path.getPrimitiveType())
67+
: GeospatialStatistics.noopBuilder();
6368
}
6469

6570
void writeNull(int repetitionLevel, int definitionLevel) {
@@ -99,6 +104,7 @@ void write(double value, int repetitionLevel, int definitionLevel) {
99104
void write(Binary value, int repetitionLevel, int definitionLevel) {
100105
statistics.updateStats(value);
101106
sizeStatisticsBuilder.add(repetitionLevel, definitionLevel, value);
107+
geospatialStatisticsBuilder.update(value);
102108
bloomFilter.insertHash(bloomFilter.hash(value));
103109
}
104110

@@ -199,4 +205,8 @@ Statistics<?> getStatistics() {
199205
SizeStatistics getSizeStatistics() {
200206
return sizeStatisticsBuilder.build();
201207
}
208+
209+
GeospatialStatistics getGeospatialStatistics() {
210+
return geospatialStatisticsBuilder.build();
211+
}
202212
}

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.apache.parquet.column.page.PageWriter;
2828
import org.apache.parquet.column.statistics.SizeStatistics;
2929
import org.apache.parquet.column.statistics.Statistics;
30+
import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics;
3031
import org.apache.parquet.column.values.ValuesWriter;
3132
import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;
3233
import org.apache.parquet.io.ParquetEncodingException;
@@ -380,6 +381,7 @@ void writePage() {
380381
valueCount,
381382
collector.getStatistics(),
382383
collector.getSizeStatistics(),
384+
collector.getGeospatialStatistics(),
383385
repetitionLevelColumn,
384386
definitionLevelColumn,
385387
dataColumn);
@@ -403,6 +405,7 @@ abstract void writePage(
403405
int valueCount,
404406
Statistics<?> statistics,
405407
SizeStatistics sizeStatistics,
408+
GeospatialStatistics geospatialStatistics,
406409
ValuesWriter repetitionLevels,
407410
ValuesWriter definitionLevels,
408411
ValuesWriter values)

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import org.apache.parquet.column.page.PageWriter;
2727
import org.apache.parquet.column.statistics.SizeStatistics;
2828
import org.apache.parquet.column.statistics.Statistics;
29+
import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics;
2930
import org.apache.parquet.column.values.ValuesWriter;
3031
import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;
3132

@@ -62,6 +63,7 @@ void writePage(
6263
int valueCount,
6364
Statistics<?> statistics,
6465
SizeStatistics sizeStatistics,
66+
GeospatialStatistics geospatialStatistics,
6567
ValuesWriter repetitionLevels,
6668
ValuesWriter definitionLevels,
6769
ValuesWriter values)
@@ -72,6 +74,7 @@ void writePage(
7274
rowCount,
7375
statistics,
7476
sizeStatistics,
77+
geospatialStatistics,
7578
repetitionLevels.getEncoding(),
7679
definitionLevels.getEncoding(),
7780
values.getEncoding());

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import org.apache.parquet.column.page.PageWriter;
2727
import org.apache.parquet.column.statistics.SizeStatistics;
2828
import org.apache.parquet.column.statistics.Statistics;
29+
import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics;
2930
import org.apache.parquet.column.values.ValuesWriter;
3031
import org.apache.parquet.column.values.bitpacking.DevNullValuesWriter;
3132
import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;
@@ -88,6 +89,7 @@ void writePage(
8889
int valueCount,
8990
Statistics<?> statistics,
9091
SizeStatistics sizeStatistics,
92+
GeospatialStatistics geospatialStatistics,
9193
ValuesWriter repetitionLevels,
9294
ValuesWriter definitionLevels,
9395
ValuesWriter values)
@@ -105,6 +107,7 @@ void writePage(
105107
encoding,
106108
bytes,
107109
statistics,
108-
sizeStatistics);
110+
sizeStatistics,
111+
geospatialStatistics);
109112
}
110113
}

parquet-column/src/main/java/org/apache/parquet/column/page/PageWriter.java

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.apache.parquet.column.Encoding;
2424
import org.apache.parquet.column.statistics.SizeStatistics;
2525
import org.apache.parquet.column.statistics.Statistics;
26+
import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics;
2627

2728
/**
2829
* a writer for all the pages of a given column chunk
@@ -86,6 +87,7 @@ void writePage(
8687
* @param valuesEncoding values encoding
8788
* @throws IOException
8889
*/
90+
@Deprecated
8991
default void writePage(
9092
BytesInput bytesInput,
9193
int valueCount,
@@ -99,6 +101,33 @@ default void writePage(
99101
throw new UnsupportedOperationException("writePage with SizeStatistics is not implemented");
100102
}
101103

104+
/**
105+
* writes a single page
106+
* @param bytesInput the bytes for the page
107+
* @param valueCount the number of values in that page
108+
* @param rowCount the number of rows in that page
109+
* @param statistics the statistics for that page
110+
* @param sizeStatistics the size statistics for that page
111+
* @param geospatialStatistics the geospatial statistics for that page
112+
* @param rlEncoding repetition level encoding
113+
* @param dlEncoding definition level encoding
114+
* @param valuesEncoding values encoding
115+
* @throws IOException
116+
*/
117+
default void writePage(
118+
BytesInput bytesInput,
119+
int valueCount,
120+
int rowCount,
121+
Statistics<?> statistics,
122+
SizeStatistics sizeStatistics,
123+
GeospatialStatistics geospatialStatistics,
124+
Encoding rlEncoding,
125+
Encoding dlEncoding,
126+
Encoding valuesEncoding)
127+
throws IOException {
128+
throw new UnsupportedOperationException("writePage with GeospatialStatistics is not implemented");
129+
}
130+
102131
/**
103132
* writes a single page in the new format
104133
*
@@ -136,6 +165,7 @@ void writePageV2(
136165
* @param sizeStatistics optional size stats for this page
137166
* @throws IOException if there is an exception while writing page data
138167
*/
168+
@Deprecated
139169
default void writePageV2(
140170
int rowCount,
141171
int nullCount,
@@ -150,6 +180,34 @@ default void writePageV2(
150180
throw new UnsupportedOperationException("writePageV2 with SizeStatistics is not implemented");
151181
}
152182

183+
/**
184+
* writes a single page in the new format
185+
* @param rowCount the number of rows in this page
186+
* @param nullCount the number of null values (out of valueCount)
187+
* @param valueCount the number of values in that page (there could be multiple values per row for repeated fields)
188+
* @param repetitionLevels the repetition levels encoded in RLE without any size header
189+
* @param definitionLevels the definition levels encoded in RLE without any size header
190+
* @param dataEncoding the encoding for the data
191+
* @param data the data encoded with dataEncoding
192+
* @param statistics optional stats for this page
193+
* @param sizeStatistics optional size stats for this page
194+
* @throws IOException if there is an exception while writing page data
195+
*/
196+
default void writePageV2(
197+
int rowCount,
198+
int nullCount,
199+
int valueCount,
200+
BytesInput repetitionLevels,
201+
BytesInput definitionLevels,
202+
Encoding dataEncoding,
203+
BytesInput data,
204+
Statistics<?> statistics,
205+
SizeStatistics sizeStatistics,
206+
GeospatialStatistics geospatialStatistics)
207+
throws IOException {
208+
throw new UnsupportedOperationException("writePageV2 with GeospatialStatistics is not implemented");
209+
}
210+
153211
/**
154212
* @return the current size used in the memory buffer for that column chunk
155213
*/

0 commit comments

Comments
 (0)