Merged
Changes from 2 commits
@@ -26,7 +26,9 @@
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.text.TextStringBuilder;
import org.apache.parquet.cli.BaseCommand;
import org.apache.parquet.column.statistics.SizeStatistics;
@@ -47,6 +49,16 @@ public ShowSizeStatisticsCommand(Logger console) {
@Parameter(description = "<parquet path>")
List<String> targets;

@Parameter(
names = {"-c", "--column", "--columns"},
description = "List of columns (dot paths) to include")
List<String> columns;

@Parameter(
names = {"-r", "--row-group", "--row-groups"},
description = "List of row-group indexes to include (0-based)")
List<Integer> rowGroups;

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
@@ -60,9 +72,13 @@ public int run() throws IOException {

console.info("\nFile path: {}", source);

List<BlockMetaData> rowGroups = footer.getBlocks();
for (int index = 0, n = rowGroups.size(); index < n; index++) {
printRowGroupSizeStats(console, index, rowGroups.get(index), schema);
List<BlockMetaData> blocks = footer.getBlocks();
Set<Integer> allowedRowGroups = rowGroups == null ? null : new HashSet<>(rowGroups);
Contributor
nit: If you created an empty Set instead of initializing with null, you wouldn't have to check for null in the loop.

Member Author
The previous implementation used a map since we were tracking depth; updated to a set, thanks!
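
A minimal, self-contained sketch of one way to read the reviewer's suggestion: when no filter is requested, pre-fill the set with every index so the loop itself carries no null or emptiness check. The class and method names here (RowGroupFilterSketch, printSelected) are hypothetical and not part of this patch.

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class RowGroupFilterSketch {

  // Prints only the row groups whose index is in "requested"; a null or empty
  // request means "print everything".
  static void printSelected(List<String> rowGroupLabels, List<Integer> requested) {
    Set<Integer> allowed = new HashSet<>();
    if (requested == null || requested.isEmpty()) {
      // No filter given: pre-fill the set with every index so the loop below
      // needs no special case.
      for (int i = 0; i < rowGroupLabels.size(); i++) {
        allowed.add(i);
      }
    } else {
      allowed.addAll(requested);
    }
    for (int index = 0; index < rowGroupLabels.size(); index++) {
      if (!allowed.contains(index)) {
        continue; // skip row groups outside the requested set
      }
      System.out.println(index + ": " + rowGroupLabels.get(index));
    }
  }

  public static void main(String[] args) {
    printSelected(Arrays.asList("rg-0", "rg-1", "rg-2"), Arrays.asList(0, 2)); // rg-0 and rg-2
    printSelected(Arrays.asList("rg-0", "rg-1", "rg-2"), null);                // all three
  }
}

The same idea would apply to the column filter further down (pre-fill with every dot path); the difference versus the null-check version is purely a readability choice.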

for (int index = 0, n = blocks.size(); index < n; index++) {
if (allowedRowGroups != null && !allowedRowGroups.contains(index)) {
continue;
}
printRowGroupSizeStats(console, index, blocks.get(index), schema);
console.info("");
}
}
@@ -84,7 +100,16 @@ private void printRowGroupSizeStats(Logger console, int index, BlockMetaData row
console.info(
String.format(formatString, "column", "unencoded bytes", "rep level histogram", "def level histogram"));

Set<String> allowedColumns = null;
if (columns != null && !columns.isEmpty()) {
allowedColumns = new HashSet<>(columns);
}
Contributor
See previous comment

Member Author
updated


for (ColumnChunkMetaData column : rowGroup.getColumns()) {
String dotPath = column.getPath().toDotString();
if (allowedColumns != null && !allowedColumns.contains(dotPath)) {
continue;
}
printColumnSizeStats(console, column, schema, maxColumnWidth);
}
}
@@ -111,6 +136,12 @@ private void printColumnSizeStats(Logger console, ColumnChunkMetaData column, Me

@Override
public List<String> getExamples() {
return Lists.newArrayList("# Show size statistics for a Parquet file", "sample.parquet");
return Lists.newArrayList(
"# Show size statistics for a Parquet file",
"sample.parquet",
"# Show size statistics for selected columns",
"sample.parquet -c name,tags",
"# Show size statistics for a specific row-group",
"sample.parquet -r 0");
}
}
@@ -34,4 +34,24 @@ public void testShowSizeStatisticsCommand() throws IOException {
command.setConf(new Configuration());
Assert.assertEquals(0, command.run());
}

@Test
public void testShowSizeStatisticsWithColumnFilter() throws IOException {
File file = parquetFile();
ShowSizeStatisticsCommand command = new ShowSizeStatisticsCommand(createLogger());
command.targets = Arrays.asList(file.getAbsolutePath());
command.columns = Arrays.asList(INT32_FIELD, INT64_FIELD);
command.setConf(new Configuration());
Assert.assertEquals(0, command.run());
}

@Test
public void testShowSizeStatisticsWithRowGroupFilter() throws IOException {
File file = parquetFile();
ShowSizeStatisticsCommand command = new ShowSizeStatisticsCommand(createLogger());
command.targets = Arrays.asList(file.getAbsolutePath());
command.rowGroups = Arrays.asList(0);
command.setConf(new Configuration());
Assert.assertEquals(0, command.run());
}
}