-
Notifications
You must be signed in to change notification settings - Fork 110
Add ParquetSource operator with Java and Spark implementations #528
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
2193e49
Create ParquetSource
5093441
Update Record.java
17759ef
Add readParquet to plan builders
c97fa72
Add Java execution operator to ParquetSource
798ec68
Add SparkSession to executor
37cc8ab
Add Spark execution operator to ParquetSource
0041654
Create WordCount example with parquet file
dcd26d3
Update parquet word count example for projections
4d18151
Add setField and addField to Record
610aef8
Merge remote-tracking branch 'upstream/main'
5c86222
Remove comment
aurb9 0109c02
Merge branch 'main' into main
aurb9 6fa1a24
Merge latest changes
d14fd24
Merge remote-tracking branch 'origin/main'
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
94 changes: 94 additions & 0 deletions
94
wayang-benchmark/src/main/java/org/apache/wayang/apps/wordcount/WordCountParquet.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,94 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.wayang.apps.wordcount; | ||
|
|
||
| import org.apache.wayang.api.JavaPlanBuilder; | ||
| import org.apache.wayang.basic.data.Tuple2; | ||
| import org.apache.wayang.basic.operators.ParquetSource; | ||
| import org.apache.wayang.core.api.WayangContext; | ||
| import org.apache.wayang.java.Java; | ||
| import org.apache.wayang.spark.Spark; | ||
|
|
||
| import java.util.Arrays; | ||
| import java.util.Collection; | ||
|
|
||
| public class WordCountParquet { | ||
|
|
||
| public static void main(String[] args){ | ||
|
|
||
| if (args.length == 0) { | ||
| System.err.print("Usage: <input file URL>"); | ||
| System.exit(1); | ||
| } | ||
|
|
||
| WayangContext wayangContext = new WayangContext(); | ||
| for (String platform : args[0].split(",")) { | ||
| switch (platform) { | ||
| case "java": | ||
| wayangContext.register(Java.basicPlugin()); | ||
| break; | ||
| case "spark": | ||
| wayangContext.register(Spark.basicPlugin()); | ||
| break; | ||
| default: | ||
| System.err.format("Unknown platform: \"%s\"\n", platform); | ||
| System.exit(3); | ||
| return; | ||
| } | ||
| } | ||
|
|
||
| /* Get a plan builder */ | ||
| JavaPlanBuilder planBuilder = new JavaPlanBuilder(wayangContext) | ||
| .withJobName("WordCount") | ||
| .withUdfJarOf(WordCountParquet.class); | ||
|
|
||
| /* Start building the Apache WayangPlan */ | ||
| Collection<Tuple2<String, Integer>> wordcounts = planBuilder | ||
| /* Read the text file */ | ||
| // .readParquet(new ParquetSource(args[1], new String[] { projectionColumns }, Arrays.copyOfRange(args, 2, args.length))) // In case of projection | ||
| .readParquet(new ParquetSource(args[1], null, Arrays.copyOfRange(args, 2, args.length))) | ||
| .withName("Load file") | ||
|
|
||
| /* Split each line by non-word characters */ | ||
| .flatMap(record -> Arrays.asList(record.getString(0).split("\\W+"))) | ||
| .withSelectivity(1, 100, 0.9) | ||
| .withName("Split words") | ||
|
|
||
| /* Filter empty tokens */ | ||
| .filter(token -> !token.isEmpty()) | ||
| .withName("Filter empty words") | ||
|
|
||
| /* Attach counter to each word */ | ||
| .map(word -> new Tuple2<>(word.toLowerCase(), 1)).withName("To lower case, add counter") | ||
|
|
||
| /* Sum up counters for every word */ | ||
| .reduceByKey( | ||
| Tuple2::getField0, | ||
| (t1, t2) -> new Tuple2<>(t1.getField0(), t1.getField1() + t2.getField1()) | ||
| ) | ||
| .withName("Add counters") | ||
|
|
||
| /* Execute the plan and collect the results */ | ||
| .collect(); | ||
|
|
||
| System.out.printf("Found %d words:\n", wordcounts.size()); | ||
| wordcounts.forEach(wc -> System.out.printf("%dx %s\n", wc.field1, wc.field0)); | ||
| } | ||
| } | ||
|
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
178 changes: 178 additions & 0 deletions
178
...g-commons/wayang-basic/src/main/java/org/apache/wayang/basic/operators/ParquetSource.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,178 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.wayang.basic.operators; | ||
|
|
||
| import org.apache.commons.lang3.Validate; | ||
| import org.apache.hadoop.fs.Path; | ||
| import org.apache.logging.log4j.LogManager; | ||
| import org.apache.logging.log4j.Logger; | ||
| import org.apache.parquet.hadoop.ParquetFileReader; | ||
| import org.apache.parquet.hadoop.util.HadoopInputFile; | ||
| import org.apache.wayang.basic.data.Record; | ||
| import org.apache.wayang.basic.types.RecordType; | ||
| import org.apache.wayang.commons.util.profiledb.model.measurement.TimeMeasurement; | ||
| import org.apache.wayang.core.api.Configuration; | ||
| import org.apache.wayang.core.optimizer.OptimizationContext; | ||
| import org.apache.wayang.core.optimizer.cardinality.CardinalityEstimate; | ||
| import org.apache.wayang.core.plan.wayangplan.UnarySource; | ||
| import org.apache.wayang.core.types.DataSetType; | ||
| import org.apache.wayang.core.util.fs.FileSystems; | ||
|
|
||
| import java.io.IOException; | ||
| import java.util.Optional; | ||
| import java.util.OptionalLong; | ||
|
|
||
| /** | ||
| * This source reads a parquet file and outputs the lines as data units. | ||
| */ | ||
| public class ParquetSource extends UnarySource<Record> { | ||
|
|
||
| private final Logger logger = LogManager.getLogger(this.getClass()); | ||
|
|
||
| private final String inputUrl; | ||
|
|
||
| private final String[] projection; | ||
|
|
||
| /** | ||
| * Creates a new instance. | ||
| * | ||
| * @param inputUrl name of the file to be read | ||
| * @param projection names of the columns to filter; can be omitted but allows for an early projection | ||
| * @param columnNames names of the columns in the tables; can be omitted but allows to inject schema information | ||
| * into Wayang, so as to allow specific optimizations | ||
| */ | ||
| public ParquetSource(String inputUrl, String[] projection, String... columnNames) { | ||
| this(inputUrl, projection, createOutputDataSetType(columnNames)); | ||
| } | ||
|
|
||
| public ParquetSource(String inputUrl, String[] projection, DataSetType<Record> type) { | ||
| super(type); | ||
| this.inputUrl = inputUrl; | ||
| this.projection = projection; | ||
| } | ||
|
|
||
| public String getInputUrl() { return this.inputUrl; } | ||
|
|
||
| public String[] getProjection() { return this.projection; } | ||
|
|
||
| private static DataSetType<Record> createOutputDataSetType(String[] columnNames) { | ||
| return columnNames.length == 0 ? | ||
| DataSetType.createDefault(Record.class) : | ||
| DataSetType.createDefault(new RecordType(columnNames)); | ||
| } | ||
|
|
||
| /** | ||
| * Copies an instance (exclusive of broadcasts). | ||
| * | ||
| * @param that that should be copied | ||
| */ | ||
| public ParquetSource(ParquetSource that) { | ||
| super(that); | ||
| this.inputUrl = that.getInputUrl(); | ||
| this.projection = that.getProjection(); | ||
| } | ||
|
|
||
| @Override | ||
| public Optional<org.apache.wayang.core.optimizer.cardinality.CardinalityEstimator> createCardinalityEstimator( | ||
| final int outputIndex, | ||
| final Configuration configuration) { | ||
| Validate.inclusiveBetween(0, this.getNumOutputs() - 1, outputIndex); | ||
| return Optional.of(new ParquetSource.CardinalityEstimator()); | ||
| } | ||
|
|
||
| /** | ||
| * Custom {@link org.apache.wayang.core.optimizer.cardinality.CardinalityEstimator} for {@link FlatMapOperator}s. | ||
| */ | ||
| protected class CardinalityEstimator implements org.apache.wayang.core.optimizer.cardinality.CardinalityEstimator { | ||
|
|
||
| public final CardinalityEstimate FALLBACK_ESTIMATE = new CardinalityEstimate(1000L, 100000000L, 0.7); | ||
|
|
||
| @Override | ||
| public CardinalityEstimate estimate(OptimizationContext optimizationContext, CardinalityEstimate... inputEstimates) { | ||
| Validate.isTrue(ParquetSource.this.getNumInputs() == inputEstimates.length); | ||
|
|
||
| // see Job for StopWatch measurements | ||
| final TimeMeasurement timeMeasurement = optimizationContext.getJob().getStopWatch().start( | ||
| "Optimization", "Cardinality&Load Estimation", "Push Estimation", "Estimate source cardinalities" | ||
| ); | ||
|
|
||
| // Query the job cache first to see if there is already an estimate. | ||
| String jobCacheKey = String.format("%s.estimate(%s)", this.getClass().getCanonicalName(), ParquetSource.this.inputUrl); | ||
| CardinalityEstimate cardinalityEstimate = optimizationContext.queryJobCache(jobCacheKey, CardinalityEstimate.class); | ||
| if (cardinalityEstimate != null) return cardinalityEstimate; | ||
|
|
||
| // Otherwise calculate the cardinality. | ||
| // First, inspect the size of the file and its line sizes. | ||
| OptionalLong fileSize = FileSystems.getFileSize(ParquetSource.this.inputUrl); | ||
| if (fileSize.isEmpty()) { | ||
| ParquetSource.this.logger.warn("Could not determine size of {}... deliver fallback estimate.", | ||
| ParquetSource.this.inputUrl); | ||
| timeMeasurement.stop(); | ||
| return this.FALLBACK_ESTIMATE; | ||
|
|
||
| } else if (fileSize.getAsLong() == 0L) { | ||
| timeMeasurement.stop(); | ||
| return new CardinalityEstimate(0L, 0L, 1d); | ||
| } | ||
|
|
||
| OptionalLong numberRows = this.extractNumberRows(); | ||
| if (numberRows.isEmpty()) { | ||
| ParquetSource.this.logger.warn("Could not determine the cardinality of {}... deliver fallback estimate.", | ||
| ParquetSource.this.inputUrl); | ||
| timeMeasurement.stop(); | ||
| return this.FALLBACK_ESTIMATE; | ||
| } | ||
|
|
||
| // Create an exact cardinality estimate for the complete file. | ||
| long rowCount = numberRows.getAsLong(); | ||
| cardinalityEstimate = new CardinalityEstimate(rowCount, rowCount, 1d); | ||
|
|
||
| // Cache the result, so that it will not be recalculated again. | ||
| optimizationContext.putIntoJobCache(jobCacheKey, cardinalityEstimate); | ||
|
|
||
| timeMeasurement.stop(); | ||
| return cardinalityEstimate; | ||
| } | ||
|
|
||
| /** | ||
| * Extract the number of rows in the file | ||
| * | ||
| * @return the number of rows in the file | ||
| */ | ||
| private OptionalLong extractNumberRows() { | ||
| Path path = new Path(ParquetSource.this.inputUrl); | ||
|
|
||
| try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new org.apache.hadoop.conf.Configuration()))) { | ||
|
|
||
| long rowCount = reader.getRecordCount(); | ||
|
|
||
| if (rowCount == 0) { | ||
| ParquetSource.this.logger.warn("Could not find any row in {}.", ParquetSource.this.inputUrl); | ||
| return OptionalLong.empty(); | ||
| } | ||
| return OptionalLong.of(rowCount); | ||
| } catch (IOException e) { | ||
| ParquetSource.this.logger.error("Could not extract the number of rows in the input file.", e); | ||
| } | ||
|
|
||
| return OptionalLong.empty(); | ||
| } | ||
| } | ||
|
|
||
| } |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.